From abea6fa90c901d0b47487ed38d44511b18f0addf Mon Sep 17 00:00:00 2001 From: bunnei Date: Fri, 23 Nov 2018 23:20:56 -0500 Subject: gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). --- src/video_core/dma_pusher.h | 95 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/video_core/dma_pusher.h (limited to 'src/video_core/dma_pusher.h') diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h new file mode 100644 index 000000000..39d98e46e --- /dev/null +++ b/src/video_core/dma_pusher.h @@ -0,0 +1,95 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/memory_manager.h" + +namespace Tegra { + +enum class SubmissionMode : u32 { + IncreasingOld = 0, + Increasing = 1, + NonIncreasingOld = 2, + NonIncreasing = 3, + Inline = 4, + IncreaseOnce = 5 +}; + +struct CommandListHeader { + union { + u64 raw; + BitField<0, 40, GPUVAddr> addr; + BitField<41, 1, u64> is_non_main; + BitField<42, 21, u64> size; + }; +}; +static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size"); + +union CommandHeader { + u32 argument; + BitField<0, 13, u32> method; + BitField<0, 24, u32> method_count_; + BitField<13, 3, u32> subchannel; + BitField<16, 13, u32> arg_count; + BitField<16, 13, u32> method_count; + BitField<29, 3, SubmissionMode> mode; +}; +static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); +static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); + +class GPU; + +/** + * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the + * emulated app fills with commands and tells PFIFO to process. 
The pushbuffers are then assembled + * into a "command stream" consisting of 32-bit words that make up "commands". + * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for + * details on this implementation. + */ +class DmaPusher { +public: + explicit DmaPusher(GPU& gpu); + ~DmaPusher(); + + void Push(const CommandListHeader& command_list_header) { + dma_pushbuffer.push(command_list_header); + } + + void DispatchCalls(); + +private: + bool Step(); + + void SetState(const CommandHeader& command_header); + + void CallMethod(u32 argument) const; + + GPU& gpu; + + std::queue<CommandListHeader> dma_pushbuffer; + + struct DmaState { + u32 method; ///< Current method + u32 subchannel; ///< Current subchannel + u32 method_count; ///< Current method count + u32 length_pending; ///< Large NI command length pending + bool non_incrementing; ///< Current command’s NI flag + }; + + DmaState dma_state{}; + bool dma_increment_once{}; + + GPUVAddr dma_put{}; ///< pushbuffer current end address + GPUVAddr dma_get{}; ///< pushbuffer current read address + GPUVAddr dma_mget{}; ///< main pushbuffer last read address + bool ib_enable{true}; ///< IB mode enabled + bool non_main{}; ///< non-main pushbuffer active +}; + +} // namespace Tegra -- cgit v1.2.3 From ac74b71d7530452126792c5fa0bf01fe7378ba00 Mon Sep 17 00:00:00 2001 From: bunnei Date: Tue, 27 Nov 2018 19:17:33 -0500 Subject: dma_pushbuffer: Optimize to avoid loop and copy on Push. 
--- src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 14 ++++++-------- src/video_core/dma_pusher.cpp | 12 ++++++++++-- src/video_core/dma_pusher.h | 10 +++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) (limited to 'src/video_core/dma_pusher.h') diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 39a58b685..2e2b0ae1c 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector< return 0; } -static void PushGPUEntries(const std::vector<Tegra::CommandListHeader>& entries) { +static void PushGPUEntries(Tegra::CommandList&& entries) { auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()}; - for (const auto& entry : entries) { - dma_pusher.Push(entry); - } + dma_pusher.Push(std::move(entries)); dma_pusher.DispatchCalls(); } @@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp params.num_entries * sizeof(Tegra::CommandListHeader), "Incorrect input size"); - std::vector<Tegra::CommandListHeader> entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; params.fence_out.value = 0; @@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags); - std::vector<Tegra::CommandListHeader> entries(params.num_entries); + Tegra::CommandList entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), params.num_entries * sizeof(Tegra::CommandListHeader)); - PushGPUEntries(entries); + PushGPUEntries(std::move(entries)); params.fence_out.id = 0; 
params.fence_out.value = 0; diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 23ec97944..63a958f11 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() { // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + dma_pushbuffer_subindex = 0; + while (Core::System::GetInstance().IsPoweredOn()) { if (!Step()) { break; @@ -89,11 +91,17 @@ bool DmaPusher::Step() { } } else if (ib_enable && !dma_pushbuffer.empty()) { // Current pushbuffer empty, but we have more IB entries to read - const CommandListHeader& command_list_header{dma_pushbuffer.front()}; + const CommandList& command_list{dma_pushbuffer.front()}; + const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]}; dma_get = command_list_header.addr; dma_put = dma_get + command_list_header.size * sizeof(u32); non_main = command_list_header.is_non_main; - dma_pushbuffer.pop(); + + if (dma_pushbuffer_subindex >= command_list.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } } else { // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do return {}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 39d98e46e..16e0697c4 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -4,6 +4,7 @@ #pragma once +#include <vector> #include <queue> #include "common/bit_field.h" @@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect class GPU; +using CommandList = std::vector<Tegra::CommandListHeader>; + /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the * emulated app fills with commands and tells PFIFO to process. 
The pushbuffers are then assembled @@ -57,8 +60,8 @@ public: explicit DmaPusher(GPU& gpu); ~DmaPusher(); - void Push(const CommandListHeader& command_list_header) { - dma_pushbuffer.push(command_list_header); + void Push(CommandList&& entries) { + dma_pushbuffer.push(std::move(entries)); } void DispatchCalls(); @@ -72,7 +75,8 @@ private: GPU& gpu; - std::queue<CommandListHeader> dma_pushbuffer; + std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed + std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer struct DmaState { u32 method; ///< Current method -- cgit v1.2.3