From abea6fa90c901d0b47487ed38d44511b18f0addf Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Fri, 23 Nov 2018 23:20:56 -0500
Subject: gpu: Rewrite GPU command list processing with DmaPusher class.

- More accurate impl., fixes Undertale (among other games).
---
 src/video_core/dma_pusher.h | 95 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 src/video_core/dma_pusher.h

(limited to 'src/video_core/dma_pusher.h')
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
new file mode 100644
index 000000000..39d98e46e
--- /dev/null
+++ b/src/video_core/dma_pusher.h
@@ -0,0 +1,95 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <queue>
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+
+enum class SubmissionMode : u32 { // Decoded from the 3-bit CommandHeader::mode field
+    IncreasingOld = 0,
+    Increasing = 1,
+    NonIncreasingOld = 2,
+    NonIncreasing = 3,
+    Inline = 4,
+    IncreaseOnce = 5
+};
+
+struct CommandListHeader {
+    union {
+        u64 raw;                          ///< Whole 64-bit entry; bits 40 and 63 map to no field
+        BitField<0, 40, GPUVAddr> addr;   ///< Bits 0-39: address the pushbuffer is read from
+        BitField<41, 1, u64> is_non_main; ///< Bit 41: marks a non-main pushbuffer
+        BitField<42, 21, u64> size;       ///< Bits 42-62: pushbuffer length in 32-bit words
+    };
+};
+static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size");
+
+union CommandHeader {
+    u32 argument;                         ///< The whole 32-bit word, unsplit
+    BitField<0, 13, u32> method;          ///< Bits 0-12
+    BitField<0, 24, u32> method_count_;   ///< Bits 0-23; overlaps method, subchannel, arg_count
+    BitField<13, 3, u32> subchannel;      ///< Bits 13-15
+    BitField<16, 13, u32> arg_count;      ///< Bits 16-28
+    BitField<16, 13, u32> method_count;   ///< Bits 16-28; same bits as arg_count
+    BitField<29, 3, SubmissionMode> mode; ///< Bits 29-31: submission mode of this header
+};
+static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
+static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
+
+class GPU;
+
+/**
+ * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
+ * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
+ * into a "command stream" consisting of 32-bit words that make up "commands".
+ * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
+ * details on this implementation.
+ */
+class DmaPusher {
+public:
+    explicit DmaPusher(GPU& gpu);
+    ~DmaPusher();
+
+    void Push(const CommandListHeader& command_list_header) {
+        dma_pushbuffer.push(command_list_header);
+    }
+
+    void DispatchCalls();
+
+private:
+    bool Step();
+
+    void SetState(const CommandHeader& command_header);
+
+    void CallMethod(u32 argument) const;
+
+    GPU& gpu;
+
+    std::queue<CommandListHeader> dma_pushbuffer;
+
+    struct DmaState {
+        u32 method;            ///< Current method
+        u32 subchannel;        ///< Current subchannel
+        u32 method_count;      ///< Current method count
+        u32 length_pending;    ///< Large NI (non-incrementing) command length pending
+        bool non_incrementing; ///< Current command's NI (non-incrementing) flag
+    };
+
+    DmaState dma_state{};
+    bool dma_increment_once{};
+
+    GPUVAddr dma_put{};   ///< pushbuffer current end address
+    GPUVAddr dma_get{};   ///< pushbuffer current read address
+    GPUVAddr dma_mget{};  ///< main pushbuffer last read address
+    bool ib_enable{true}; ///< IB mode enabled
+    bool non_main{};      ///< non-main pushbuffer active
+};
+
+} // namespace Tegra
-- 
cgit v1.2.3


From ac74b71d7530452126792c5fa0bf01fe7378ba00 Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Tue, 27 Nov 2018 19:17:33 -0500
Subject: dma_pushbuffer: Optimize to avoid loop and copy on Push.

---
 src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 14 ++++++--------
 src/video_core/dma_pusher.cpp                     | 12 ++++++++++--
 src/video_core/dma_pusher.h                       | 10 +++++++---
 3 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'src/video_core/dma_pusher.h')

diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 39a58b685..2e2b0ae1c 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -128,11 +128,9 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
     return 0;
 }
 
-static void PushGPUEntries(const std::vector<Tegra::CommandListHeader>& entries) {
+static void PushGPUEntries(Tegra::CommandList&& entries) {
     auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
-    for (const auto& entry : entries) {
-        dma_pusher.Push(entry);
-    }
+    dma_pusher.Push(std::move(entries));
     dma_pusher.DispatchCalls();
 }
 
@@ -149,11 +147,11 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
                                    params.num_entries * sizeof(Tegra::CommandListHeader),
                "Incorrect input size");
 
-    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
+    Tegra::CommandList entries(params.num_entries);
     std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                 params.num_entries * sizeof(Tegra::CommandListHeader));
 
-    PushGPUEntries(entries);
+    PushGPUEntries(std::move(entries));
 
     params.fence_out.id = 0;
     params.fence_out.value = 0;
@@ -170,11 +168,11 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
     LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
                 params.address, params.num_entries, params.flags);
 
-    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
+    Tegra::CommandList entries(params.num_entries);
     Memory::ReadBlock(params.address, entries.data(),
                       params.num_entries * sizeof(Tegra::CommandListHeader));
 
-    PushGPUEntries(entries);
+    PushGPUEntries(std::move(entries));
 
     params.fence_out.id = 0;
     params.fence_out.value = 0;
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 23ec97944..63a958f11 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -23,6 +23,8 @@ void DmaPusher::DispatchCalls() {
     // On entering GPU code, assume all memory may be touched by the ARM core.
     gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
 
+    dma_pushbuffer_subindex = 0;
+
     while (Core::System::GetInstance().IsPoweredOn()) {
         if (!Step()) {
             break;
@@ -89,11 +91,17 @@ bool DmaPusher::Step() {
         }
     } else if (ib_enable && !dma_pushbuffer.empty()) {
         // Current pushbuffer empty, but we have more IB entries to read
-        const CommandListHeader& command_list_header{dma_pushbuffer.front()};
+        const CommandList& command_list{dma_pushbuffer.front()};
+        const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
         dma_get = command_list_header.addr;
         dma_put = dma_get + command_list_header.size * sizeof(u32);
         non_main = command_list_header.is_non_main;
-        dma_pushbuffer.pop();
+
+        if (dma_pushbuffer_subindex >= command_list.size()) {
+            // We've gone through the current list, remove it from the queue
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+        }
     } else {
         // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do
         return {};
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 39d98e46e..16e0697c4 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
 #include <queue>
+#include <vector>
 
 #include "common/bit_field.h"
@@ -45,6 +46,8 @@ static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect
 
 class GPU;
 
+using CommandList = std::vector<Tegra::CommandListHeader>;
+
 /**
  * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
  * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
@@ -57,8 +60,16 @@ public:
     explicit DmaPusher(GPU& gpu);
     ~DmaPusher();
 
-    void Push(const CommandListHeader& command_list_header) {
-        dma_pushbuffer.push(command_list_header);
+    /**
+     * Queues a command list for processing by DispatchCalls().
+     * Empty lists are dropped: Step() indexes the front list without checking its size, so an
+     * empty list would be read past its end.
+     */
+    void Push(CommandList&& entries) {
+        if (entries.empty()) {
+            return;
+        }
+        dma_pushbuffer.push(std::move(entries));
     }
 
     void DispatchCalls();
@@ -72,7 +83,8 @@ private:
 
     GPU& gpu;
 
-    std::queue<CommandListHeader> dma_pushbuffer;
+    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
+    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer
 
     struct DmaState {
         u32 method;            ///< Current method
-- 
cgit v1.2.3