From 4aec060f6de410698d5b0a5bffd42d4327b258e4 Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Tue, 6 Apr 2021 20:30:22 +0200
Subject: common/threadsafe_queue: Provide Wait() method.

It shall block until there is something to consume in the queue.

And use it for the GPU emulation instead of the spin loop.
This only happens while booting the emulator; however, in BOTW it lasts for about 1 second.
---
 src/video_core/gpu_thread.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'src/video_core/gpu_thread.cpp')

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 99353f15f..cd59a7faf 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -29,8 +29,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
     system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
-    while (state.queue.Empty())
-        ;
+    state.queue.Wait();
 
     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {
-- 
cgit v1.2.3


From 5145133a604f626c05f832465ac22019b003c32a Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 08:42:54 +0200
Subject: video_core/gpu_thread: Implement a ShutDown method.

This was implicitly done by `is_powered_on = false`, however the explicit method allows us to block until the GPU is actually gone.

This should fix a race condition when the other subsystems are removed while the GPU is still active.
---
 src/video_core/gpu_thread.cpp | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'src/video_core/gpu_thread.cpp')

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index cd59a7faf..6b8f06f78 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -68,13 +68,7 @@ ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
     : system{system_}, is_async{is_async_} {}
 
 ThreadManager::~ThreadManager() {
-    if (!thread.joinable()) {
-        return;
-    }
-
-    // Notify GPU thread that a shutdown is pending
-    PushCommand(EndProcessingCommand());
-    thread.join();
+    ShutDown();
 }
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
@@ -132,10 +126,26 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
 
 void ThreadManager::WaitIdle() const {
     while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
-           system.IsPoweredOn()) {
+           state.is_running) {
     }
 }
 
+void ThreadManager::ShutDown() {
+    if (!state.is_running) {
+        return;
+    }
+
+    state.is_running = false;
+
+    if (!thread.joinable()) {
+        return;
+    }
+
+    // Notify GPU thread that a shutdown is pending
+    PushCommand(EndProcessingCommand());
+    thread.join();
+}
+
 void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
-- 
cgit v1.2.3


From e6fb49fa4bb2864702abcefc14f6bb62eaba7a7e Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 13:57:49 +0200
Subject: video_core/gpu_thread: Keep the write lock for allocating the fence.

Otherwise the fences might get submitted out of order into the queue, which makes testing them pointless.
Overhead should be tiny as the mutex is just moved from the queue to the writing code.
---
 src/video_core/gpu_thread.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/video_core/gpu_thread.cpp')

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 6b8f06f78..9488bf544 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -151,11 +151,13 @@ void ThreadManager::OnCommandListEnd() {
 }
 
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
+    std::unique_lock lk(state.write_lock);
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
 
     if (!is_async) {
         // In synchronous GPU mode, block the caller until the command has executed
+        lk.unlock();
         WaitIdle();
     }
 
-- 
cgit v1.2.3


From e8bd9aed8bf0f60455d0ae6a8f6f3abf92dd8305 Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 7 Apr 2021 11:41:31 +0200
Subject: video_core: Use a CV for blocking commands.

There is no need for a busy loop here. Let's just use a condition variable to save some power.
---
 src/video_core/gpu_thread.cpp | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'src/video_core/gpu_thread.cpp')

diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 9488bf544..7addfbc7b 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -56,11 +56,17 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
         } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
             rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
         } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
-            return;
+            ASSERT(state.is_running == false);
         } else {
             UNREACHABLE();
         }
         state.signaled_fence.store(next.fence);
+        if (next.block) {
+            // Hold the write_lock while notifying so that a waiter cannot miss the wake-up between
+            // checking its predicate and blocking on the condition_variable.
+            std::lock_guard lk(state.write_lock);
+            state.cv.notify_all();
+        }
     }
 }
 
@@ -105,9 +111,8 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
     case Settings::GPUAccuracy::Extreme: {
         auto& gpu = system.GPU();
         u64 fence = gpu.RequestFlush(addr, size);
-        PushCommand(GPUTickCommand());
-        while (fence > gpu.CurrentFlushRequestFence()) {
-        }
+        PushCommand(GPUTickCommand(), true);
+        ASSERT(fence <= gpu.CurrentFlushRequestFence());
         break;
     }
     default:
@@ -124,18 +129,16 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     rasterizer->OnCPUWrite(addr, size);
 }
 
-void ThreadManager::WaitIdle() const {
-    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
-           state.is_running) {
-    }
-}
-
 void ThreadManager::ShutDown() {
     if (!state.is_running) {
         return;
     }
 
-    state.is_running = false;
+    {
+        std::lock_guard lk(state.write_lock);
+        state.is_running = false;
+        state.cv.notify_all();
+    }
 
     if (!thread.joinable()) {
         return;
@@ -150,15 +153,21 @@ void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
 
-u64 ThreadManager::PushCommand(CommandData&& command_data) {
+u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
+    if (!is_async) {
+        // In synchronous GPU mode, block the caller until the command has executed
+        block = true;
+    }
+
     std::unique_lock lk(state.write_lock);
     const u64 fence{++state.last_fence};
-    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
+    state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
 
-    if (!is_async) {
-        // In synchronous GPU mode, block the caller until the command has executed
-        lk.unlock();
-        WaitIdle();
+    if (block) {
+        state.cv.wait(lk, [this, fence] {
+            return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
+                   !state.is_running;
+        });
     }
 
     return fence;
-- 
cgit v1.2.3