31 files changed, 733 insertions, 532 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0c3038c52..14b76680f 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -123,6 +123,8 @@ if (ENABLE_VULKAN)
         renderer_vulkan/vk_memory_manager.h
         renderer_vulkan/vk_resource_manager.cpp
         renderer_vulkan/vk_resource_manager.h
+        renderer_vulkan/vk_sampler_cache.cpp
+        renderer_vulkan/vk_sampler_cache.h
         renderer_vulkan/vk_scheduler.cpp
         renderer_vulkan/vk_scheduler.h
         renderer_vulkan/vk_stream_buffer.cpp
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index aae2a4019..daefa43a6 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -9,6 +9,7 @@
 #include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
 
 namespace Tegra::Engines {
 
@@ -48,7 +49,8 @@ void KeplerMemory::ProcessData(u32 data) {
     // We have to invalidate the destination region to evict any outdated surfaces from the cache.
     // We do this before actually writing the new data because the destination address might contain
     // a dirty surface that will have to be written back to memory.
-    Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32));
+    system.Renderer().Rasterizer().InvalidateRegion(ToCacheAddr(Memory::GetPointer(*dest_address)),
+                                                    sizeof(u32));
 
     Memory::Write32(*dest_address, data);
     system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 144e7fa82..49979694e 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -396,7 +396,10 @@ void Maxwell3D::ProcessCBData(u32 value) {
     const auto address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos);
     ASSERT_MSG(address, "Invalid GPU address");
 
-    Memory::Write32(*address, value);
+    u8* ptr{Memory::GetPointer(*address)};
+    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
+    std::memcpy(ptr, &value, sizeof(u32));
+
     dirty_flags.OnMemoryWrite();
 
     // Increment the current buffer position.
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 9dfea5999..415a6319a 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -9,6 +9,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
@@ -92,12 +93,14 @@ void MaxwellDMA::HandleCopy() {
     const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
         // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
         // copying.
-        Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size);
+        Core::System::GetInstance().Renderer().Rasterizer().FlushRegion(
+            ToCacheAddr(Memory::GetPointer(*source_cpu)), src_size);
 
         // We have to invalidate the destination region to evict any outdated surfaces from the
         // cache. We do this before actually writing the new data because the destination address
         // might contain a dirty surface that will have to be written back to memory.
-        Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size);
+        Core::System::GetInstance().Renderer().Rasterizer().InvalidateRegion(
+            ToCacheAddr(Memory::GetPointer(*dest_cpu)), dst_size);
     };
 
     if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 56a203275..a14b95c30 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -11,6 +11,11 @@
 #include "video_core/dma_pusher.h"
 #include "video_core/memory_manager.h"
 
+using CacheAddr = std::uintptr_t;
+inline CacheAddr ToCacheAddr(const void* host_ptr) {
+    return reinterpret_cast<CacheAddr>(host_ptr);
+}
+
 namespace Core {
 class System;
 }
@@ -209,13 +214,13 @@ public:
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be invalidated
-    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
-    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
 private:
     void ProcessBindMethod(const MethodCall& method_call);
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index ad0a747e3..8b355cf7b 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -22,15 +22,15 @@ void GPUAsynch::SwapBuffers(
     gpu_thread.SwapBuffers(std::move(framebuffer));
 }
 
-void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
+void GPUAsynch::FlushRegion(CacheAddr addr, u64 size) {
     gpu_thread.FlushRegion(addr, size);
 }
 
-void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
+void GPUAsynch::InvalidateRegion(CacheAddr addr, u64 size) {
     gpu_thread.InvalidateRegion(addr, size);
 }
 
-void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     gpu_thread.FlushAndInvalidateRegion(addr, size);
 }
 
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index e6a807aba..1dcc61a6c 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -26,9 +26,9 @@ public:
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
-    void FlushRegion(VAddr addr, u64 size) override;
-    void InvalidateRegion(VAddr addr, u64 size) override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void FlushRegion(CacheAddr addr, u64 size) override;
+    void InvalidateRegion(CacheAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
 
 private:
     GPUThread::ThreadManager gpu_thread;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 4c00b96c7..2cfc900ed 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -22,15 +22,15 @@ void GPUSynch::SwapBuffers(
     renderer.SwapBuffers(std::move(framebuffer));
 }
 
-void GPUSynch::FlushRegion(VAddr addr, u64 size) {
+void GPUSynch::FlushRegion(CacheAddr addr, u64 size) {
     renderer.Rasterizer().FlushRegion(addr, size);
 }
 
-void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
+void GPUSynch::InvalidateRegion(CacheAddr addr, u64 size) {
     renderer.Rasterizer().InvalidateRegion(addr, size);
 }
 
-void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void GPUSynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);
 }
 
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 7d5a241ff..766b5631c 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -21,9 +21,9 @@ public:
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
-    void FlushRegion(VAddr addr, u64 size) override;
-    void InvalidateRegion(VAddr addr, u64 size) override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void FlushRegion(CacheAddr addr, u64 size) override;
+    void InvalidateRegion(CacheAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c5bdd2a17..086b2f625 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -5,7 +5,6 @@
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/frontend/scope_acquire_window_context.h"
-#include "core/settings.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
@@ -13,38 +12,13 @@
 
 namespace VideoCommon::GPUThread {
 
-/// Executes a single GPU thread command
-static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
-                           Tegra::DmaPusher& dma_pusher) {
-    if (const auto submit_list = std::get_if<SubmitListCommand>(command)) {
-        dma_pusher.Push(std::move(submit_list->entries));
-        dma_pusher.DispatchCalls();
-    } else if (const auto data = std::get_if<SwapBuffersCommand>(command)) {
-        renderer.SwapBuffers(data->framebuffer);
-    } else if (const auto data = std::get_if<FlushRegionCommand>(command)) {
-        renderer.Rasterizer().FlushRegion(data->addr, data->size);
-    } else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) {
-        renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
-    } else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
-        renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size);
-    } else {
-        UNREACHABLE();
-    }
-}
-
 /// Runs the GPU thread
 static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
                       SynchState& state) {
-
     MicroProfileOnThreadCreate("GpuThread");
 
-    auto WaitForWakeup = [&]() {
-        std::unique_lock<std::mutex> lock{state.signal_mutex};
-        state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
-    };
-
     // Wait for first GPU command before acquiring the window context
-    WaitForWakeup();
+    state.WaitForCommands();
 
     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {
@@ -53,100 +27,72 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
 
     Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
 
+    CommandDataContainer next;
     while (state.is_running) {
-        if (!state.is_running) {
-            return;
-        }
-
-        {
-            // Thread has been woken up, so make the previous write queue the next read queue
-            std::lock_guard<std::mutex> lock{state.signal_mutex};
-            std::swap(state.push_queue, state.pop_queue);
-        }
-
-        // Execute all of the GPU commands
-        while (!state.pop_queue->empty()) {
-            ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
-            state.pop_queue->pop();
+        state.WaitForCommands();
+        while (!state.queue.Empty()) {
+            state.queue.Pop(next);
+            if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
+                dma_pusher.Push(std::move(submit_list->entries));
+                dma_pusher.DispatchCalls();
+            } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
+                state.DecrementFramesCounter();
+                renderer.SwapBuffers(std::move(data->framebuffer));
+            } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
+                renderer.Rasterizer().FlushRegion(data->addr, data->size);
+            } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
+                renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
+            } else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) {
+                return;
+            } else {
+                UNREACHABLE();
+            }
         }
-
-        state.UpdateIdleState();
-
-        // Signal that the GPU thread has finished processing commands
-        if (state.is_idle) {
-            state.idle_condition.notify_one();
-        }
-
-        // Wait for CPU thread to send more GPU commands
-        WaitForWakeup();
     }
 }
 
 ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
     : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
-                                                         std::ref(dma_pusher), std::ref(state)},
-      thread_id{thread.get_id()} {}
+                                                         std::ref(dma_pusher), std::ref(state)} {}
 
 ThreadManager::~ThreadManager() {
-    {
-        // Notify GPU thread that a shutdown is pending
-        std::lock_guard<std::mutex> lock{state.signal_mutex};
-        state.is_running = false;
-    }
-
-    state.signal_condition.notify_one();
+    // Notify GPU thread that a shutdown is pending
+    PushCommand(EndProcessingCommand());
     thread.join();
 }
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
-    if (entries.empty()) {
-        return;
-    }
-
-    PushCommand(SubmitListCommand(std::move(entries)), false, false);
+    PushCommand(SubmitListCommand(std::move(entries)));
 }
 
 void ThreadManager::SwapBuffers(
     std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-    PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false);
+    state.IncrementFramesCounter();
+    PushCommand(SwapBuffersCommand(std::move(framebuffer)));
+    state.WaitForFrames();
 }
 
-void ThreadManager::FlushRegion(VAddr addr, u64 size) {
-    // Block the CPU when using accurate emulation
-    PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false);
+void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
+    PushCommand(FlushRegionCommand(addr, size));
 }
 
-void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
-    PushCommand(InvalidateRegionCommand(addr, size), true, true);
+void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
+    // An empty queue takes the quicker direct path on the calling thread. NOTE(review): RunThread
+    // pops a command before executing it, so "empty" need not mean idle - confirm this is safe.
+    if (!state.queue.Empty()) {
+        return PushCommand(InvalidateRegionCommand(addr, size));
+    }
+    renderer.Rasterizer().InvalidateRegion(addr, size);
 }
 
-void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
+    // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
     InvalidateRegion(addr, size);
 }
 
-void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) {
-    {
-        std::lock_guard<std::mutex> lock{state.signal_mutex};
-
-        if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
-            // Execute the command synchronously on the current thread
-            ExecuteCommand(&command_data, renderer, dma_pusher);
-            return;
-        }
-
-        // Push the command to the GPU thread
-        state.UpdateIdleState();
-        state.push_queue->emplace(command_data);
-    }
-
-    // Signal the GPU thread that commands are pending
-    state.signal_condition.notify_one();
-
-    if (wait_for_idle) {
-        // Wait for the GPU to be idle (all commands to be executed)
-        std::unique_lock<std::mutex> lock{state.idle_mutex};
-        state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
-    }
+void ThreadManager::PushCommand(CommandData&& command_data) {
+    state.queue.Push(CommandDataContainer(std::move(command_data)));
+    state.SignalCommands();
 }
 
 } // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index edb148b14..8cd7db1c6 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -13,6 +13,9 @@
 #include <thread>
 #include <variant>
 
+#include "common/threadsafe_queue.h"
+#include "video_core/gpu.h"
+
 namespace Tegra {
 struct FramebufferConfig;
 class DmaPusher;
@@ -24,6 +27,9 @@ class RendererBase;
 
 namespace VideoCommon::GPUThread {
 
+/// Command to signal to the GPU thread that processing has ended
+struct EndProcessingCommand final {};
+
 /// Command to signal to the GPU thread that a command list is ready for processing
 struct SubmitListCommand final {
     explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
@@ -36,59 +42,110 @@ struct SwapBuffersCommand final {
     explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
         : framebuffer{std::move(framebuffer)} {}
 
-    std::optional<const Tegra::FramebufferConfig> framebuffer;
+    std::optional<Tegra::FramebufferConfig> framebuffer;
 };
 
 /// Command to signal to the GPU thread to flush a region
 struct FlushRegionCommand final {
-    explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+    explicit constexpr FlushRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
 
-    const VAddr addr;
-    const u64 size;
+    CacheAddr addr;
+    u64 size;
 };
 
 /// Command to signal to the GPU thread to invalidate a region
 struct InvalidateRegionCommand final {
-    explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+    explicit constexpr InvalidateRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
 
-    const VAddr addr;
-    const u64 size;
+    CacheAddr addr;
+    u64 size;
 };
 
 /// Command to signal to the GPU thread to flush and invalidate a region
 struct FlushAndInvalidateRegionCommand final {
-    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
+    explicit constexpr FlushAndInvalidateRegionCommand(CacheAddr addr, u64 size)
         : addr{addr}, size{size} {}
 
-    const VAddr addr;
-    const u64 size;
+    CacheAddr addr;
+    u64 size;
 };
 
-using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
-                                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+using CommandData =
+    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+
+/// Element type of the GPU thread's command queue: a thin wrapper around one CommandData.
+///
+/// No copy/move operations are declared, so the compiler generates all of them and an
+/// assignment from an rvalue moves the payload (e.g. a submitted command list) instead of
+/// deep-copying it.
+struct CommandDataContainer {
+    CommandDataContainer() = default;
+
+    CommandDataContainer(CommandData&& data) : data{std::move(data)} {}
+
+    CommandData data;
+};
 
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
-    std::atomic<bool> is_running{true};
-    std::atomic<bool> is_idle{true};
-    std::condition_variable signal_condition;
-    std::mutex signal_mutex;
-    std::condition_variable idle_condition;
-    std::mutex idle_mutex;
-
-    // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and
-    // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes
-    // empty. This allows for efficient thread-safe access, as it does not require any copies.
-
-    using CommandQueue = std::queue<CommandData>;
-    std::array<CommandQueue, 2> command_queues;
-    CommandQueue* push_queue{&command_queues[0]};
-    CommandQueue* pop_queue{&command_queues[1]};
-
-    void UpdateIdleState() {
-        std::lock_guard<std::mutex> lock{idle_mutex};
-        is_idle = command_queues[0].empty() && command_queues[1].empty();
+    std::atomic_bool is_running{true};
+    std::atomic_int queued_frame_count{};
+    std::mutex frames_mutex;
+    std::mutex commands_mutex;
+    std::condition_variable commands_condition;
+    std::condition_variable frames_condition;
+
+    void IncrementFramesCounter() { // Registers one more queued frame while holding frames_mutex
+        std::lock_guard<std::mutex> lock{frames_mutex};
+        ++queued_frame_count;
+    }
+
+    /// Consumes one queued frame; wakes a frames_condition waiter once none are left.
+    void DecrementFramesCounter() {
+        bool frames_remaining{};
+        {
+            std::lock_guard<std::mutex> guard{frames_mutex};
+            frames_remaining = (--queued_frame_count) != 0;
+        }
+        if (!frames_remaining) {
+            frames_condition.notify_one(); // signalled after frames_mutex has been released
+        }
+    }
+
+    /// Blocks the caller until queued_frame_count reaches zero.
+    ///
+    /// Returns without blocking when no frames are queued: the predicate overload of
+    /// std::condition_variable::wait tests the predicate before it ever sleeps, so a separate
+    /// early-out check (and a second acquisition of frames_mutex) is unnecessary. The predicate
+    /// also guards against spurious wakeups.
+    void WaitForFrames() {
+        std::unique_lock<std::mutex> lock{frames_mutex};
+
+        // DecrementFramesCounter() notifies frames_condition once the counter hits zero.
+        const auto no_frames_queued = [this] { return queued_frame_count == 0; };
+
+        frames_condition.wait(lock, no_frames_queued);
+    }
+
+    /// Wakes a WaitForCommands() waiter, but only if the queue holds a command.
+    void SignalCommands() {
+        std::unique_lock<std::mutex> guard{commands_mutex};
+        const bool has_commands = !queue.Empty();
+        guard.unlock();
+
+        if (has_commands) {
+            commands_condition.notify_one();
+        }
+    }
+
+    void WaitForCommands() { // Blocks the caller until the command queue is non-empty
+        std::unique_lock<std::mutex> lock{commands_mutex};
+        commands_condition.wait(lock, [this] { return !queue.Empty(); });
+    }
+
+    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+    CommandQueue queue;
 };
 
 /// Class used to manage the GPU thread
@@ -105,22 +162,17 @@ public:
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    void FlushRegion(VAddr addr, u64 size);
+    void FlushRegion(CacheAddr addr, u64 size);
 
     /// Notify rasterizer that any caches of the specified region should be invalidated
-    void InvalidateRegion(VAddr addr, u64 size);
+    void InvalidateRegion(CacheAddr addr, u64 size);
 
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
-    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+    void FlushAndInvalidateRegion(CacheAddr addr, u64 size);
 
 private:
     /// Pushes a command to be executed by the GPU thread
-    void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu);
-
-    /// Returns true if this is called by the GPU thread
-    bool IsGpuThread() const {
-        return std::this_thread::get_id() == thread_id;
-    }
+    void PushCommand(CommandData&& command_data);
 
 private:
     SynchState state;
diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index b68f4fb13..9692ce143 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -16,12 +16,12 @@ namespace VideoCore {
 using Surface::GetBytesPerPixel;
 using Surface::PixelFormat;
 
-using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, std::size_t, VAddr);
+using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, VAddr);
 using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>;
 
 template <bool morton_to_linear, PixelFormat format>
 static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth,
-                       u32 tile_width_spacing, u8* buffer, std::size_t buffer_size, VAddr addr) {
+                       u32 tile_width_spacing, u8* buffer, VAddr addr) {
     constexpr u32 bytes_per_pixel = GetBytesPerPixel(format);
 
     // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
@@ -42,142 +42,138 @@ static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth
 }
 
 static constexpr ConversionArray morton_to_linear_fns = {
-    // clang-format off
-        MortonCopy<true, PixelFormat::ABGR8U>,
-        MortonCopy<true, PixelFormat::ABGR8S>,
-        MortonCopy<true, PixelFormat::ABGR8UI>,
-        MortonCopy<true, PixelFormat::B5G6R5U>,
-        MortonCopy<true, PixelFormat::A2B10G10R10U>,
-        MortonCopy<true, PixelFormat::A1B5G5R5U>,
-        MortonCopy<true, PixelFormat::R8U>,
-        MortonCopy<true, PixelFormat::R8UI>,
-        MortonCopy<true, PixelFormat::RGBA16F>,
-        MortonCopy<true, PixelFormat::RGBA16U>,
-        MortonCopy<true, PixelFormat::RGBA16UI>,
-        MortonCopy<true, PixelFormat::R11FG11FB10F>,
-        MortonCopy<true, PixelFormat::RGBA32UI>,
-        MortonCopy<true, PixelFormat::DXT1>,
-        MortonCopy<true, PixelFormat::DXT23>,
-        MortonCopy<true, PixelFormat::DXT45>,
-        MortonCopy<true, PixelFormat::DXN1>,
-        MortonCopy<true, PixelFormat::DXN2UNORM>,
-        MortonCopy<true, PixelFormat::DXN2SNORM>,
-        MortonCopy<true, PixelFormat::BC7U>,
-        MortonCopy<true, PixelFormat::BC6H_UF16>,
-        MortonCopy<true, PixelFormat::BC6H_SF16>,
-        MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
-        MortonCopy<true, PixelFormat::BGRA8>,
-        MortonCopy<true, PixelFormat::RGBA32F>,
-        MortonCopy<true, PixelFormat::RG32F>,
-        MortonCopy<true, PixelFormat::R32F>,
-        MortonCopy<true, PixelFormat::R16F>,
-        MortonCopy<true, PixelFormat::R16U>,
-        MortonCopy<true, PixelFormat::R16S>,
-        MortonCopy<true, PixelFormat::R16UI>,
-        MortonCopy<true, PixelFormat::R16I>,
-        MortonCopy<true, PixelFormat::RG16>,
-        MortonCopy<true, PixelFormat::RG16F>,
-        MortonCopy<true, PixelFormat::RG16UI>,
-        MortonCopy<true, PixelFormat::RG16I>,
-        MortonCopy<true, PixelFormat::RG16S>,
-        MortonCopy<true, PixelFormat::RGB32F>,
-        MortonCopy<true, PixelFormat::RGBA8_SRGB>,
-        MortonCopy<true, PixelFormat::RG8U>,
-        MortonCopy<true, PixelFormat::RG8S>,
-        MortonCopy<true, PixelFormat::RG32UI>,
-        MortonCopy<true, PixelFormat::R32UI>,
-        MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
-        MortonCopy<true, PixelFormat::ASTC_2D_8X5>,
-        MortonCopy<true, PixelFormat::ASTC_2D_5X4>,
-        MortonCopy<true, PixelFormat::BGRA8_SRGB>,
-        MortonCopy<true, PixelFormat::DXT1_SRGB>,
-        MortonCopy<true, PixelFormat::DXT23_SRGB>,
-        MortonCopy<true, PixelFormat::DXT45_SRGB>,
-        MortonCopy<true, PixelFormat::BC7U_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_5X5>,
-        MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>,
-        MortonCopy<true, PixelFormat::ASTC_2D_10X8>,
-        MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>,
-        MortonCopy<true, PixelFormat::Z32F>,
-        MortonCopy<true, PixelFormat::Z16>,
-        MortonCopy<true, PixelFormat::Z24S8>,
-        MortonCopy<true, PixelFormat::S8Z24>,
-        MortonCopy<true, PixelFormat::Z32FS8>,
-    // clang-format on
+    MortonCopy<true, PixelFormat::ABGR8U>,
+    MortonCopy<true, PixelFormat::ABGR8S>,
+    MortonCopy<true, PixelFormat::ABGR8UI>,
+    MortonCopy<true, PixelFormat::B5G6R5U>,
+    MortonCopy<true, PixelFormat::A2B10G10R10U>,
+    MortonCopy<true, PixelFormat::A1B5G5R5U>,
+    MortonCopy<true, PixelFormat::R8U>,
+    MortonCopy<true, PixelFormat::R8UI>,
+    MortonCopy<true, PixelFormat::RGBA16F>,
+    MortonCopy<true, PixelFormat::RGBA16U>,
+    MortonCopy<true, PixelFormat::RGBA16UI>,
+    MortonCopy<true, PixelFormat::R11FG11FB10F>,
+    MortonCopy<true, PixelFormat::RGBA32UI>,
+    MortonCopy<true, PixelFormat::DXT1>,
+    MortonCopy<true, PixelFormat::DXT23>,
+    MortonCopy<true, PixelFormat::DXT45>,
+    MortonCopy<true, PixelFormat::DXN1>,
+    MortonCopy<true, PixelFormat::DXN2UNORM>,
+    MortonCopy<true, PixelFormat::DXN2SNORM>,
+    MortonCopy<true, PixelFormat::BC7U>,
+    MortonCopy<true, PixelFormat::BC6H_UF16>,
+    MortonCopy<true, PixelFormat::BC6H_SF16>,
+    MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
+    MortonCopy<true, PixelFormat::BGRA8>,
+    MortonCopy<true, PixelFormat::RGBA32F>,
+    MortonCopy<true, PixelFormat::RG32F>,
+    MortonCopy<true, PixelFormat::R32F>,
+    MortonCopy<true, PixelFormat::R16F>,
+    MortonCopy<true, PixelFormat::R16U>,
+    MortonCopy<true, PixelFormat::R16S>,
+    MortonCopy<true, PixelFormat::R16UI>,
+    MortonCopy<true, PixelFormat::R16I>,
+    MortonCopy<true, PixelFormat::RG16>,
+    MortonCopy<true, PixelFormat::RG16F>,
+    MortonCopy<true, PixelFormat::RG16UI>,
+    MortonCopy<true, PixelFormat::RG16I>,
+    MortonCopy<true, PixelFormat::RG16S>,
+    MortonCopy<true, PixelFormat::RGB32F>,
+    MortonCopy<true, PixelFormat::RGBA8_SRGB>,
+    MortonCopy<true, PixelFormat::RG8U>,
+    MortonCopy<true, PixelFormat::RG8S>,
+    MortonCopy<true, PixelFormat::RG32UI>,
+    MortonCopy<true, PixelFormat::R32UI>,
+    MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
+    MortonCopy<true, PixelFormat::ASTC_2D_8X5>,
+    MortonCopy<true, PixelFormat::ASTC_2D_5X4>,
+    MortonCopy<true, PixelFormat::BGRA8_SRGB>,
+    MortonCopy<true, PixelFormat::DXT1_SRGB>,
+    MortonCopy<true, PixelFormat::DXT23_SRGB>,
+    MortonCopy<true, PixelFormat::DXT45_SRGB>,
+    MortonCopy<true, PixelFormat::BC7U_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_5X5>,
+    MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>,
+    MortonCopy<true, PixelFormat::ASTC_2D_10X8>,
+    MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>,
+    MortonCopy<true, PixelFormat::Z32F>,
+    MortonCopy<true, PixelFormat::Z16>,
+    MortonCopy<true, PixelFormat::Z24S8>,
+    MortonCopy<true, PixelFormat::S8Z24>,
+    MortonCopy<true, PixelFormat::Z32FS8>,
 };
 
 static constexpr ConversionArray linear_to_morton_fns = {
-    // clang-format off
-        MortonCopy<false, PixelFormat::ABGR8U>,
-        MortonCopy<false, PixelFormat::ABGR8S>,
-        MortonCopy<false, PixelFormat::ABGR8UI>,
-        MortonCopy<false, PixelFormat::B5G6R5U>,
-        MortonCopy<false, PixelFormat::A2B10G10R10U>,
-        MortonCopy<false, PixelFormat::A1B5G5R5U>,
-        MortonCopy<false, PixelFormat::R8U>,
-        MortonCopy<false, PixelFormat::R8UI>,
-        MortonCopy<false, PixelFormat::RGBA16F>,
-        MortonCopy<false, PixelFormat::RGBA16U>,
-        MortonCopy<false, PixelFormat::RGBA16UI>,
-        MortonCopy<false, PixelFormat::R11FG11FB10F>,
-        MortonCopy<false, PixelFormat::RGBA32UI>,
-        MortonCopy<false, PixelFormat::DXT1>,
-        MortonCopy<false, PixelFormat::DXT23>,
-        MortonCopy<false, PixelFormat::DXT45>,
-        MortonCopy<false, PixelFormat::DXN1>,
-        MortonCopy<false, PixelFormat::DXN2UNORM>,
-        MortonCopy<false, PixelFormat::DXN2SNORM>,
-        MortonCopy<false, PixelFormat::BC7U>,
-        MortonCopy<false, PixelFormat::BC6H_UF16>,
-        MortonCopy<false, PixelFormat::BC6H_SF16>,
-        // TODO(Subv): Swizzling ASTC formats are not supported
-        nullptr,
-        MortonCopy<false, PixelFormat::BGRA8>,
-        MortonCopy<false, PixelFormat::RGBA32F>,
-        MortonCopy<false, PixelFormat::RG32F>,
-        MortonCopy<false, PixelFormat::R32F>,
-        MortonCopy<false, PixelFormat::R16F>,
-        MortonCopy<false, PixelFormat::R16U>,
-        MortonCopy<false, PixelFormat::R16S>,
-        MortonCopy<false, PixelFormat::R16UI>,
-        MortonCopy<false, PixelFormat::R16I>,
-        MortonCopy<false, PixelFormat::RG16>,
-        MortonCopy<false, PixelFormat::RG16F>,
-        MortonCopy<false, PixelFormat::RG16UI>,
-        MortonCopy<false, PixelFormat::RG16I>,
-        MortonCopy<false, PixelFormat::RG16S>,
-        MortonCopy<false, PixelFormat::RGB32F>,
-        MortonCopy<false, PixelFormat::RGBA8_SRGB>,
-        MortonCopy<false, PixelFormat::RG8U>,
-        MortonCopy<false, PixelFormat::RG8S>,
-        MortonCopy<false, PixelFormat::RG32UI>,
-        MortonCopy<false, PixelFormat::R32UI>,
-        nullptr,
-        nullptr,
-        nullptr,
-        MortonCopy<false, PixelFormat::BGRA8_SRGB>,
-        MortonCopy<false, PixelFormat::DXT1_SRGB>,
-        MortonCopy<false, PixelFormat::DXT23_SRGB>,
-        MortonCopy<false, PixelFormat::DXT45_SRGB>,
-        MortonCopy<false, PixelFormat::BC7U_SRGB>,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        MortonCopy<false, PixelFormat::Z32F>,
-        MortonCopy<false, PixelFormat::Z16>,
-        MortonCopy<false, PixelFormat::Z24S8>,
-        MortonCopy<false, PixelFormat::S8Z24>,
-        MortonCopy<false, PixelFormat::Z32FS8>,
-    // clang-format on
+    MortonCopy<false, PixelFormat::ABGR8U>,
+    MortonCopy<false, PixelFormat::ABGR8S>,
+    MortonCopy<false, PixelFormat::ABGR8UI>,
+    MortonCopy<false, PixelFormat::B5G6R5U>,
+    MortonCopy<false, PixelFormat::A2B10G10R10U>,
+    MortonCopy<false, PixelFormat::A1B5G5R5U>,
+    MortonCopy<false, PixelFormat::R8U>,
+    MortonCopy<false, PixelFormat::R8UI>,
+    MortonCopy<false, PixelFormat::RGBA16F>,
+    MortonCopy<false, PixelFormat::RGBA16U>,
+    MortonCopy<false, PixelFormat::RGBA16UI>,
+    MortonCopy<false, PixelFormat::R11FG11FB10F>,
+    MortonCopy<false, PixelFormat::RGBA32UI>,
+    MortonCopy<false, PixelFormat::DXT1>,
+    MortonCopy<false, PixelFormat::DXT23>,
+    MortonCopy<false, PixelFormat::DXT45>,
+    MortonCopy<false, PixelFormat::DXN1>,
+    MortonCopy<false, PixelFormat::DXN2UNORM>,
+    MortonCopy<false, PixelFormat::DXN2SNORM>,
+    MortonCopy<false, PixelFormat::BC7U>,
+    MortonCopy<false, PixelFormat::BC6H_UF16>,
+    MortonCopy<false, PixelFormat::BC6H_SF16>,
+    // TODO(Subv): Swizzling ASTC formats are not supported
+    nullptr,
+    MortonCopy<false, PixelFormat::BGRA8>,
+    MortonCopy<false, PixelFormat::RGBA32F>,
+    MortonCopy<false, PixelFormat::RG32F>,
+    MortonCopy<false, PixelFormat::R32F>,
+    MortonCopy<false, PixelFormat::R16F>,
+    MortonCopy<false, PixelFormat::R16U>,
+    MortonCopy<false, PixelFormat::R16S>,
+    MortonCopy<false, PixelFormat::R16UI>,
+    MortonCopy<false, PixelFormat::R16I>,
+    MortonCopy<false, PixelFormat::RG16>,
+    MortonCopy<false, PixelFormat::RG16F>,
+    MortonCopy<false, PixelFormat::RG16UI>,
+    MortonCopy<false, PixelFormat::RG16I>,
+    MortonCopy<false, PixelFormat::RG16S>,
+    MortonCopy<false, PixelFormat::RGB32F>,
+    MortonCopy<false, PixelFormat::RGBA8_SRGB>,
+    MortonCopy<false, PixelFormat::RG8U>,
+    MortonCopy<false, PixelFormat::RG8S>,
+    MortonCopy<false, PixelFormat::RG32UI>,
+    MortonCopy<false, PixelFormat::R32UI>,
+    nullptr,
+    nullptr,
+    nullptr,
+    MortonCopy<false, PixelFormat::BGRA8_SRGB>,
+    MortonCopy<false, PixelFormat::DXT1_SRGB>,
+    MortonCopy<false, PixelFormat::DXT23_SRGB>,
+    MortonCopy<false, PixelFormat::DXT45_SRGB>,
+    MortonCopy<false, PixelFormat::BC7U_SRGB>,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    MortonCopy<false, PixelFormat::Z32F>,
+    MortonCopy<false, PixelFormat::Z16>,
+    MortonCopy<false, PixelFormat::Z24S8>,
+    MortonCopy<false, PixelFormat::S8Z24>,
+    MortonCopy<false, PixelFormat::Z32FS8>,
 };
 
 static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) {
@@ -191,45 +187,6 @@ static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFor
     return morton_to_linear_fns[static_cast<std::size_t>(format)];
 }
 
-/// 8x8 Z-Order coordinate from 2D coordinates
-static u32 MortonInterleave(u32 x, u32 y) {
-    static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15};
-    static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a};
-    return xlut[x % 8] + ylut[y % 8];
-}
-
-/// Calculates the offset of the position of the pixel in Morton order
-static u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) {
-    // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
-    // of which is composed of four 2x2 subtiles each of which is composed of four texels.
-    // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
-    // texels are laid out in a 2x2 subtile like this:
-    // 2 3
-    // 0 1
-    //
-    // The full 8x8 tile has the texels arranged like this:
-    //
-    // 42 43 46 47 58 59 62 63
-    // 40 41 44 45 56 57 60 61
-    // 34 35 38 39 50 51 54 55
-    // 32 33 36 37 48 49 52 53
-    // 10 11 14 15 26 27 30 31
-    // 08 09 12 13 24 25 28 29
-    // 02 03 06 07 18 19 22 23
-    // 00 01 04 05 16 17 20 21
-    //
-    // This pattern is what's called Z-order curve, or Morton order.
-
-    const unsigned int block_height = 8;
-    const unsigned int coarse_x = x & ~7;
-
-    u32 i = MortonInterleave(x, y);
-
-    const unsigned int offset = coarse_x * block_height;
-
-    return (i + offset) * bytes_per_pixel;
-}
-
 static u32 MortonInterleave128(u32 x, u32 y) {
     // 128x128 Z-Order coordinate from 2D coordinates
     static constexpr u32 xlut[] = {
@@ -325,14 +282,14 @@ static u32 GetMortonOffset128(u32 x, u32 y, u32 bytes_per_pixel) {
 
 void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
                    u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
-                   u8* buffer, std::size_t buffer_size, VAddr addr) {
-
+                   u8* buffer, VAddr addr) {
     GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth,
-                                     tile_width_spacing, buffer, buffer_size, addr);
+                                     tile_width_spacing, buffer, addr);
 }
 
-void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixel, u32 linear_bytes_per_pixel,
-                         u8* morton_data, u8* linear_data, bool morton_to_linear) {
+void MortonCopyPixels128(MortonSwizzleMode mode, u32 width, u32 height, u32 bytes_per_pixel,
+                         u32 linear_bytes_per_pixel, u8* morton_data, u8* linear_data) {
+    const bool morton_to_linear = mode == MortonSwizzleMode::MortonToLinear;
     u8* data_ptrs[2];
     for (u32 y = 0; y < height; ++y) {
         for (u32 x = 0; x < width; ++x) {
diff --git a/src/video_core/morton.h b/src/video_core/morton.h
index 065f59ce3..b565204b5 100644
--- a/src/video_core/morton.h
+++ b/src/video_core/morton.h
@@ -13,9 +13,9 @@ enum class MortonSwizzleMode { MortonToLinear, LinearToMorton };
 
 void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride,
                    u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
-                   u8* buffer, std::size_t buffer_size, VAddr addr);
+                   u8* buffer, VAddr addr);
 
-void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixel, u32 linear_bytes_per_pixel,
-                         u8* morton_data, u8* linear_data, bool morton_to_linear);
+void MortonCopyPixels128(MortonSwizzleMode mode, u32 width, u32 height, u32 bytes_per_pixel,
+                         u32 linear_bytes_per_pixel, u8* morton_data, u8* linear_data);
 
 } // namespace VideoCore
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index a7bcf26fb..ecd9986a0 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <mutex>
 #include <set>
 #include <unordered_map>
 
@@ -12,14 +13,26 @@
 
 #include "common/common_types.h"
 #include "core/settings.h"
+#include "video_core/gpu.h"
 #include "video_core/rasterizer_interface.h"
 
 class RasterizerCacheObject {
 public:
+    explicit RasterizerCacheObject(const u8* host_ptr)
+        : cache_addr{ToCacheAddr(host_ptr)}, host_ptr{host_ptr} {} // members in declaration order
+
     virtual ~RasterizerCacheObject();
 
+    CacheAddr GetCacheAddr() const {
+        return cache_addr;
+    }
+
+    const u8* GetHostPtr() const {
+        return host_ptr;
+    }
+
     /// Gets the address of the shader in guest memory, required for cache management
-    virtual VAddr GetAddr() const = 0;
+    virtual VAddr GetCpuAddr() const = 0;
 
     /// Gets the size of the shader in guest memory, required for cache management
     virtual std::size_t GetSizeInBytes() const = 0;
@@ -58,6 +71,8 @@ private:
     bool is_registered{};      ///< Whether the object is currently registered with the cache
     bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
     u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
+    CacheAddr cache_addr{};    ///< Cache address, distinct from the emulated virtual address space
+    const u8* host_ptr{};      ///< Pointer to the memory backing this cached region
 };
 
 template <class T>
@@ -68,7 +83,9 @@ public:
     explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
 
     /// Write any cached resources overlapping the specified region back to memory
-    void FlushRegion(Tegra::GPUVAddr addr, size_t size) {
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         const auto& objects{GetSortedObjectsFromRegion(addr, size)};
         for (auto& object : objects) {
             FlushObject(object);
@@ -76,7 +93,9 @@ public:
     }
 
     /// Mark the specified region as being invalidated
-    void InvalidateRegion(VAddr addr, u64 size) {
+    void InvalidateRegion(CacheAddr addr, u64 size) {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         const auto& objects{GetSortedObjectsFromRegion(addr, size)};
         for (auto& object : objects) {
             if (!object->IsRegistered()) {
@@ -89,48 +108,60 @@ public:
 
     /// Invalidates everything in the cache
     void InvalidateAll() {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         while (interval_cache.begin() != interval_cache.end()) {
             Unregister(*interval_cache.begin()->second.begin());
         }
     }
 
 protected:
-    /// Tries to get an object from the cache with the specified address
-    T TryGet(VAddr addr) const {
+    /// Tries to get an object from the cache with the specified cache address
+    T TryGet(CacheAddr addr) const {
         const auto iter = map_cache.find(addr);
         if (iter != map_cache.end())
             return iter->second;
         return nullptr;
     }
 
+    /// Tries to get an object from the cache with the specified host pointer
+    T TryGet(const void* addr) const {
+        const CacheAddr cache_addr = ToCacheAddr(addr);
+        // Same lookup as the CacheAddr overload, keyed by the pointer's cache address
+        return TryGet(cache_addr);
+    }
+
     /// Register an object into the cache
     void Register(const T& object) {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         object->SetIsRegistered(true);
         interval_cache.add({GetInterval(object), ObjectSet{object}});
-        map_cache.insert({object->GetAddr(), object});
-        rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), 1);
+        map_cache.insert({object->GetCacheAddr(), object});
+        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
     }
 
     /// Unregisters an object from the cache
     void Unregister(const T& object) {
-        object->SetIsRegistered(false);
-        rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1);
-        // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
-        if (Settings::values.use_accurate_gpu_emulation) {
-            FlushObject(object);
-        }
+        std::lock_guard<std::recursive_mutex> lock{mutex};
 
+        object->SetIsRegistered(false);
+        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
         interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(object->GetAddr());
+        map_cache.erase(object->GetCacheAddr());
     }
 
     /// Returns a ticks counter used for tracking when cached objects were last modified
     u64 GetModifiedTicks() {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         return ++modified_ticks;
     }
 
     /// Flushes the specified object, updating appropriate cache state as needed
     void FlushObject(const T& object) {
+        std::lock_guard<std::recursive_mutex> lock{mutex};
+
         if (!object->IsDirty()) {
             return;
         }
@@ -140,7 +171,7 @@ protected:
 
 private:
     /// Returns a list of cached objects from the specified memory region, ordered by access time
-    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
+    std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) {
         if (size == 0) {
             return {};
         }
@@ -164,17 +195,18 @@ private:
     }
 
     using ObjectSet = std::set<T>;
-    using ObjectCache = std::unordered_map<VAddr, T>;
-    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
+    using ObjectCache = std::unordered_map<CacheAddr, T>;
+    using IntervalCache = boost::icl::interval_map<CacheAddr, ObjectSet>;
     using ObjectInterval = typename IntervalCache::interval_type;
 
     static auto GetInterval(const T& object) {
-        return ObjectInterval::right_open(object->GetAddr(),
-                                          object->GetAddr() + object->GetSizeInBytes());
+        return ObjectInterval::right_open(object->GetCacheAddr(),
+                                          object->GetCacheAddr() + object->GetSizeInBytes());
     }
 
     ObjectCache map_cache;
     IntervalCache interval_cache; ///< Cache of objects
     u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
     VideoCore::RasterizerInterface& rasterizer;
+    std::recursive_mutex mutex;
 };
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6a1dc9cf6..76e292e87 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -35,14 +35,14 @@ public:
     virtual void FlushAll() = 0;
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be invalidated
-    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
     /// and invalidated
-    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
     /// Attempt to use a faster method to perform a surface copy
     virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
@@ -63,7 +63,7 @@ public:
     }
 
     /// Increase/decrease the number of object in pages touching the specified region
-    virtual void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {}
+    virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}
 
     /// Initialize disk cached resources for the game being emulated
     virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false,
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index b3062e5ba..a4eea61a6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -13,6 +13,11 @@
 
 namespace OpenGL {
 
+CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
+                                     std::size_t alignment, u8* host_ptr)
+    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
+      alignment{alignment} {} // base class first, then members in declaration order
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
     : RasterizerCache{rasterizer}, stream_buffer(size, true) {}
 
@@ -26,11 +31,12 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
     // TODO: Figure out which size is the best for given games.
     cache &= size >= 2048;
 
+    const auto& host_ptr{Memory::GetPointer(*cpu_addr)};
     if (cache) {
-        auto entry = TryGet(*cpu_addr);
+        auto entry = TryGet(host_ptr);
         if (entry) {
-            if (entry->size >= size && entry->alignment == alignment) {
-                return entry->offset;
+            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
+                return entry->GetOffset();
             }
             Unregister(entry);
         }
@@ -39,17 +45,17 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
     AlignBuffer(alignment);
     const GLintptr uploaded_offset = buffer_offset;
 
-    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+    if (!host_ptr) {
+        return uploaded_offset;
+    }
 
+    std::memcpy(buffer_ptr, host_ptr, size);
     buffer_ptr += size;
     buffer_offset += size;
 
     if (cache) {
-        auto entry = std::make_shared<CachedBufferEntry>();
-        entry->offset = uploaded_offset;
-        entry->size = size;
-        entry->alignment = alignment;
-        entry->addr = *cpu_addr;
+        auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset,
+                                                         alignment, host_ptr);
         Register(entry);
     }
 
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index c11acfb79..1de1f84ae 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -17,22 +17,39 @@ namespace OpenGL {
 
 class RasterizerOpenGL;
 
-struct CachedBufferEntry final : public RasterizerCacheObject {
-    VAddr GetAddr() const override {
-        return addr;
+class CachedBufferEntry final : public RasterizerCacheObject {
+public:
+    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
+                               std::size_t alignment, u8* host_ptr);
+
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
     }
 
     std::size_t GetSizeInBytes() const override {
         return size;
     }
 
+    std::size_t GetSize() const {
+        return size;
+    }
+
+    GLintptr GetOffset() const {
+        return offset;
+    }
+
+    std::size_t GetAlignment() const {
+        return alignment;
+    }
+
     // We do not have to flush this cache as things in it are never modified by us.
     void Flush() override {}
 
-    VAddr addr;
-    std::size_t size;
-    GLintptr offset;
-    std::size_t alignment;
+private:
+    VAddr cpu_addr{};
+    std::size_t size{};
+    GLintptr offset{};
+    std::size_t alignment{};
 };
 
 class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
index c7f32feaa..a2c509c24 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_global_cache.cpp
@@ -15,12 +15,13 @@
 
 namespace OpenGL {
 
-CachedGlobalRegion::CachedGlobalRegion(VAddr addr, u32 size) : addr{addr}, size{size} {
+CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
+    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {
     buffer.Create();
     // Bind and unbind the buffer so it gets allocated by the driver
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
-    LabelGLObject(GL_BUFFER, buffer.handle, addr, "GlobalMemory");
+    LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
 }
 
 void CachedGlobalRegion::Reload(u32 size_) {
@@ -35,7 +36,7 @@ void CachedGlobalRegion::Reload(u32 size_) {
 
     // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
-    glBufferData(GL_SHADER_STORAGE_BUFFER, size, Memory::GetPointer(addr), GL_DYNAMIC_DRAW);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);
 }
 
 GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const {
@@ -46,19 +47,19 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32
     return search->second;
 }
 
-GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size) {
+GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr) {
     GlobalRegion region{TryGetReservedGlobalRegion(addr, size)};
     if (!region) {
         // No reserved surface available, create a new one and reserve it
-        region = std::make_shared<CachedGlobalRegion>(addr, size);
+        region = std::make_shared<CachedGlobalRegion>(addr, size, host_ptr);
         ReserveGlobalRegion(region);
     }
     region->Reload(size);
     return region;
 }
 
-void GlobalRegionCacheOpenGL::ReserveGlobalRegion(const GlobalRegion& region) {
-    reserve[region->GetAddr()] = region;
+void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
+    reserve.insert_or_assign(region->GetCpuAddr(), std::move(region));
 }
 
 GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
@@ -80,11 +81,12 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
     ASSERT(actual_addr);
 
     // Look up global region in the cache based on address
-    GlobalRegion region = TryGet(*actual_addr);
+    const auto& host_ptr{Memory::GetPointer(*actual_addr)};
+    GlobalRegion region{TryGet(host_ptr)};
 
     if (!region) {
         // No global region found - create a new one
-        region = GetUncachedGlobalRegion(*actual_addr, size);
+        region = GetUncachedGlobalRegion(*actual_addr, size, host_ptr);
         Register(region);
     }
 
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
index 37830bb7c..e497a0619 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ b/src/video_core/renderer_opengl/gl_global_cache.h
@@ -27,15 +27,13 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
 
 class CachedGlobalRegion final : public RasterizerCacheObject {
 public:
-    explicit CachedGlobalRegion(VAddr addr, u32 size);
+    explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
 
-    /// Gets the address of the shader in guest memory, required for cache management
-    VAddr GetAddr() const {
-        return addr;
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
     }
 
-    /// Gets the size of the shader in guest memory, required for cache management
-    std::size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const override {
         return size;
     }
 
@@ -53,9 +51,8 @@ public:
     }
 
 private:
-    VAddr addr{};
+    VAddr cpu_addr{};
     u32 size{};
-
     OGLBuffer buffer;
 };
 
@@ -69,8 +66,8 @@ public:
 
 private:
     GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const;
-    GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size);
-    void ReserveGlobalRegion(const GlobalRegion& region);
+    GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr);
+    void ReserveGlobalRegion(GlobalRegion region);
 
     std::unordered_map<VAddr, GlobalRegion> reserve;
 };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 824863561..bb6de5477 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -102,8 +102,9 @@ struct FramebufferCacheKey {
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system,
                                    ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, emu_window{window},
-      screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
+    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this},
+      emu_window{window}, system{system}, screen_info{info},
+      buffer_cache(*this, STREAM_BUFFER_SIZE) {
     // Create sampler objects
     for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
@@ -138,7 +139,7 @@ void RasterizerOpenGL::CheckExtensions() {
 }
 
 GLuint RasterizerOpenGL::SetupVertexFormat() {
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     if (!gpu.dirty_flags.vertex_attrib_format) {
@@ -207,7 +208,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
 }
 
 void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     if (gpu.dirty_flags.vertex_array.none())
@@ -248,7 +249,7 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
 }
 
 DrawParameters RasterizerOpenGL::SetupDraw() {
-    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
 
@@ -297,7 +298,7 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
     MICROPROFILE_SCOPE(OpenGL_Shader);
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = system.GPU().Maxwell3D();
 
     BaseBindings base_bindings;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
@@ -413,7 +414,7 @@ void RasterizerOpenGL::SetupCachedFramebuffer(const FramebufferCacheKey& fbkey,
 }
 
 std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     std::size_t size = 0;
     for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
@@ -431,7 +432,7 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
 }
 
 std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     return static_cast<std::size_t>(regs.index_array.count) *
            static_cast<std::size_t>(regs.index_array.FormatSizeInBytes());
@@ -448,7 +449,7 @@ static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
     return boost::make_iterator_range(map.equal_range(interval));
 }
 
-void RasterizerOpenGL::UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {
+void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
     const u64 page_start{addr >> Memory::PAGE_BITS};
     const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};
 
@@ -487,7 +488,7 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
     OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool preserve_contents,
     std::optional<std::size_t> single_color_target) {
     MICROPROFILE_SCOPE(OpenGL_Framebuffer);
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
@@ -581,7 +582,7 @@ void RasterizerOpenGL::Clear() {
     const auto prev_state{state};
     SCOPE_EXIT({ prev_state.Apply(); });
 
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     bool use_color{};
     bool use_depth{};
     bool use_stencil{};
@@ -672,7 +673,7 @@ void RasterizerOpenGL::DrawArrays() {
         return;
 
     MICROPROFILE_SCOPE(OpenGL_Drawing);
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
     ConfigureFramebuffers(state);
@@ -746,12 +747,12 @@ void RasterizerOpenGL::DrawArrays() {
 
 void RasterizerOpenGL::FlushAll() {}
 
-void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     res_cache.FlushRegion(addr, size);
 }
 
-void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     res_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
@@ -759,7 +760,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
     buffer_cache.InvalidateRegion(addr, size);
 }
 
-void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     FlushRegion(addr, size);
     InvalidateRegion(addr, size);
 }
@@ -781,7 +782,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
 
-    const auto& surface{res_cache.TryFindFramebufferSurface(framebuffer_addr)};
+    const auto& surface{res_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))};
     if (!surface) {
         return {};
     }
@@ -892,7 +893,7 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
                                          const Shader& shader, GLuint program_handle,
                                          BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& gpu = Core::System::GetInstance().GPU();
+    const auto& gpu = system.GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
     const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& entries = shader->GetShaderEntries().const_buffers;
@@ -971,7 +972,7 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
 void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
                                      GLuint program_handle, BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
-    const auto& gpu = Core::System::GetInstance().GPU();
+    const auto& gpu = system.GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
     const auto& entries = shader->GetShaderEntries().samplers;
 
@@ -998,7 +999,7 @@ void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& s
 }
 
 void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     const bool geometry_shaders_enabled =
         regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry));
     const std::size_t viewport_count =
@@ -1021,7 +1022,7 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
 void RasterizerOpenGL::SyncClipEnabled(
     const std::array<bool, Maxwell::Regs::NumClipDistances>& clip_mask) {
 
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     const std::array<bool, Maxwell::Regs::NumClipDistances> reg_state{
         regs.clip_distance_enabled.c0 != 0, regs.clip_distance_enabled.c1 != 0,
         regs.clip_distance_enabled.c2 != 0, regs.clip_distance_enabled.c3 != 0,
@@ -1038,7 +1039,7 @@ void RasterizerOpenGL::SyncClipCoef() {
 }
 
 void RasterizerOpenGL::SyncCullMode() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     state.cull.enabled = regs.cull.enabled != 0;
 
@@ -1062,14 +1063,14 @@ void RasterizerOpenGL::SyncCullMode() {
 }
 
 void RasterizerOpenGL::SyncPrimitiveRestart() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     state.primitive_restart.enabled = regs.primitive_restart.enabled;
     state.primitive_restart.index = regs.primitive_restart.index;
 }
 
 void RasterizerOpenGL::SyncDepthTestState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     state.depth.test_enabled = regs.depth_test_enable != 0;
     state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;
@@ -1081,7 +1082,7 @@ void RasterizerOpenGL::SyncDepthTestState() {
 }
 
 void RasterizerOpenGL::SyncStencilTestState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     state.stencil.test_enabled = regs.stencil_enable != 0;
 
     if (!regs.stencil_enable) {
@@ -1115,7 +1116,7 @@ void RasterizerOpenGL::SyncStencilTestState() {
 }
 
 void RasterizerOpenGL::SyncColorMask() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t count =
         regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;
     for (std::size_t i = 0; i < count; i++) {
@@ -1129,18 +1130,18 @@ void RasterizerOpenGL::SyncColorMask() {
 }
 
 void RasterizerOpenGL::SyncMultiSampleState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     state.multisample_control.alpha_to_coverage = regs.multisample_control.alpha_to_coverage != 0;
     state.multisample_control.alpha_to_one = regs.multisample_control.alpha_to_one != 0;
 }
 
 void RasterizerOpenGL::SyncFragmentColorClampState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     state.fragment_color_clamp.enabled = regs.frag_color_clamp != 0;
 }
 
 void RasterizerOpenGL::SyncBlendState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     state.blend_color.red = regs.blend_color.r;
     state.blend_color.green = regs.blend_color.g;
@@ -1182,7 +1183,7 @@ void RasterizerOpenGL::SyncBlendState() {
 }
 
 void RasterizerOpenGL::SyncLogicOpState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
 
     state.logic_op.enabled = regs.logic_op.enable != 0;
 
@@ -1196,7 +1197,7 @@ void RasterizerOpenGL::SyncLogicOpState() {
 }
 
 void RasterizerOpenGL::SyncScissorTest(OpenGLState& current_state) {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     const bool geometry_shaders_enabled =
         regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry));
     const std::size_t viewport_count =
@@ -1218,17 +1219,17 @@ void RasterizerOpenGL::SyncScissorTest(OpenGLState& current_state) {
 }
 
 void RasterizerOpenGL::SyncTransformFeedback() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented");
 }
 
 void RasterizerOpenGL::SyncPointState() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     state.point.size = regs.point_size;
 }
 
 void RasterizerOpenGL::SyncPolygonOffset() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;
     state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;
     state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;
@@ -1238,7 +1239,7 @@ void RasterizerOpenGL::SyncPolygonOffset() {
 }
 
 void RasterizerOpenGL::CheckAlphaTests() {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;
     UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1,
                          "Alpha Testing is enabled with more than one rendertarget");
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 7e63f8008..30f3e8acb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -57,9 +57,9 @@ public:
     void DrawArrays() override;
     void Clear() override;
     void FlushAll() override;
-    void FlushRegion(VAddr addr, u64 size) override;
-    void InvalidateRegion(VAddr addr, u64 size) override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void FlushRegion(CacheAddr addr, u64 size) override;
+    void InvalidateRegion(CacheAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
     bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                const Tegra::Engines::Fermi2D::Regs::Surface& dst,
                                const Common::Rectangle<u32>& src_rect,
@@ -67,7 +67,7 @@ public:
     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                            u32 pixel_stride) override;
     bool AccelerateDrawBatch(bool is_indexed) override;
-    void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) override;
+    void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
     void LoadDiskResources(const std::atomic_bool& stop_loading,
                            const VideoCore::DiskResourceLoadCallback& callback) override;
 
@@ -215,6 +215,7 @@ private:
     GlobalRegionCacheOpenGL global_cache;
 
     Core::Frontend::EmuWindow& emu_window;
+    Core::System& system;
 
     ScreenInfo& screen_info;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index e9eb6e921..451de00e8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -61,6 +61,7 @@ void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) {
 
     addr = cpu_addr ? *cpu_addr : 0;
     gpu_addr = gpu_addr_;
+    host_ptr = Memory::GetPointer(addr);
     size_in_bytes = SizeInBytesRaw();
 
     if (IsPixelFormatASTC(pixel_format)) {
@@ -446,7 +447,7 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
             MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level),
                           params.MipBlockHeight(mip_level), params.MipHeight(mip_level),
                           params.MipBlockDepth(mip_level), 1, params.tile_width_spacing,
-                          gl_buffer.data() + offset_gl, gl_size, params.addr + offset);
+                          gl_buffer.data() + offset_gl, params.addr + offset);
             offset += layer_size;
             offset_gl += gl_size;
         }
@@ -455,7 +456,7 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
         MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level),
                       params.MipBlockHeight(mip_level), params.MipHeight(mip_level),
                       params.MipBlockDepth(mip_level), depth, params.tile_width_spacing,
-                      gl_buffer.data(), gl_buffer.size(), params.addr + offset);
+                      gl_buffer.data(), params.addr + offset);
     }
 }
 
@@ -563,8 +564,8 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac
 }
 
 CachedSurface::CachedSurface(const SurfaceParams& params)
-    : params(params), gl_target(SurfaceTargetToGL(params.target)),
-      cached_size_in_bytes(params.size_in_bytes) {
+    : RasterizerCacheObject{params.host_ptr}, params{params},
+      gl_target{SurfaceTargetToGL(params.target)}, cached_size_in_bytes{params.size_in_bytes} {
     texture.Create(gl_target);
 
     // TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0)
@@ -633,10 +634,9 @@ void CachedSurface::LoadGLBuffer() {
         const u32 bpp = params.GetFormatBpp() / 8;
         const u32 copy_size = params.width * bpp;
         if (params.pitch == copy_size) {
-            std::memcpy(gl_buffer[0].data(), Memory::GetPointer(params.addr),
-                        params.size_in_bytes_gl);
+            std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
         } else {
-            const u8* start = Memory::GetPointer(params.addr);
+            const u8* start{params.host_ptr};
             u8* write_to = gl_buffer[0].data();
             for (u32 h = params.height; h > 0; h--) {
                 std::memcpy(write_to, start, copy_size);
@@ -680,8 +680,6 @@ void CachedSurface::FlushGLBuffer() {
     glPixelStorei(GL_PACK_ROW_LENGTH, 0);
     Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,
                                            params.height, params.depth, true, true);
-    const u8* const texture_src_data = Memory::GetPointer(params.addr);
-    ASSERT(texture_src_data);
     if (params.is_tiled) {
         ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",
                    params.block_width, static_cast<u32>(params.target));
@@ -691,9 +689,9 @@ void CachedSurface::FlushGLBuffer() {
         const u32 bpp = params.GetFormatBpp() / 8;
         const u32 copy_size = params.width * bpp;
         if (params.pitch == copy_size) {
-            std::memcpy(Memory::GetPointer(params.addr), gl_buffer[0].data(), GetSizeInBytes());
+            std::memcpy(params.host_ptr, gl_buffer[0].data(), GetSizeInBytes());
         } else {
-            u8* start = Memory::GetPointer(params.addr);
+            u8* start{params.host_ptr};
             const u8* read_to = gl_buffer[0].data();
             for (u32 h = params.height; h > 0; h--) {
                 std::memcpy(start, read_to, copy_size);
@@ -932,7 +930,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
     }
 
     // Look up surface in the cache based on address
-    Surface surface{TryGet(params.addr)};
+    Surface surface{TryGet(params.host_ptr)};
     if (surface) {
         if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
             // Use the cached surface as-is unless it's not synced with memory
@@ -986,7 +984,7 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
     for (u32 layer = 0; layer < dst_params.depth; layer++) {
         for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) {
             const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap);
-            const Surface& copy = TryGet(sub_address);
+            const Surface& copy = TryGet(Memory::GetPointer(sub_address));
             if (!copy)
                 continue;
             const auto& src_params{copy->GetSurfaceParams()};
@@ -1163,7 +1161,8 @@ void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface,
     const auto& dst_params{dst_surface->GetSurfaceParams()};
 
     // Flush enough memory for both the source and destination surface
-    FlushRegion(src_params.addr, std::max(src_params.MemorySize(), dst_params.MemorySize()));
+    FlushRegion(ToCacheAddr(src_params.host_ptr),
+                std::max(src_params.MemorySize(), dst_params.MemorySize()));
 
     LoadSurface(dst_surface);
 }
@@ -1215,8 +1214,8 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
     return new_surface;
 }
 
-Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr addr) const {
-    return TryGet(addr);
+Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(const u8* host_ptr) const {
+    return TryGet(host_ptr);
 }
 
 void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) {
@@ -1267,7 +1266,7 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
             src_params.height == dst_params.MipHeight(*level) &&
             src_params.block_height >= dst_params.MipBlockHeight(*level)) {
             const std::optional<u32> slot =
-                TryFindBestLayer(render_surface->GetAddr(), dst_params, *level);
+                TryFindBestLayer(render_surface->GetCpuAddr(), dst_params, *level);
             if (slot.has_value()) {
                 glCopyImageSubData(render_surface->Texture().handle,
                                    SurfaceTargetToGL(src_params.target), 0, 0, 0, 0,
@@ -1283,8 +1282,8 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
 }
 
 static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) {
-    const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize();
-    const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize();
+    const VAddr bound1 = blitted_surface->GetCpuAddr() + blitted_surface->GetMemorySize();
+    const VAddr bound2 = render_surface->GetCpuAddr() + render_surface->GetMemorySize();
     if (bound2 > bound1)
         return true;
     const auto& dst_params = blitted_surface->GetSurfaceParams();
@@ -1327,7 +1326,8 @@ void RasterizerCacheOpenGL::SignalPreDrawCall() {
 void RasterizerCacheOpenGL::SignalPostDrawCall() {
     for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {
         if (current_color_buffers[i] != nullptr) {
-            Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr());
+            Surface intersect =
+                CollideOnReinterpretedSurface(current_color_buffers[i]->GetCacheAddr());
             if (intersect != nullptr) {
                 PartialReinterpretSurface(current_color_buffers[i], intersect);
                 texception = true;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 9cf6f50be..b3afad139 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -297,6 +297,7 @@ struct SurfaceParams {
     bool srgb_conversion;
     // Parameters used for caching
     VAddr addr;
+    u8* host_ptr;
     Tegra::GPUVAddr gpu_addr;
     std::size_t size_in_bytes;
     std::size_t size_in_bytes_gl;
@@ -345,9 +346,9 @@ class RasterizerOpenGL;
 
 class CachedSurface final : public RasterizerCacheObject {
 public:
-    CachedSurface(const SurfaceParams& params);
+    explicit CachedSurface(const SurfaceParams& params);
 
-    VAddr GetAddr() const override {
+    VAddr GetCpuAddr() const override {
         return params.addr;
     }
 
@@ -449,7 +450,7 @@ public:
     Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);
 
     /// Tries to find a framebuffer using on the provided CPU address
-    Surface TryFindFramebufferSurface(VAddr addr) const;
+    Surface TryFindFramebufferSurface(const u8* host_ptr) const;
 
     /// Copies the contents of one surface to another
     void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
@@ -506,12 +507,12 @@ private:
     std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
     Surface last_depth_buffer;
 
-    using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>;
+    using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>;
     using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
 
     static auto GetReinterpretInterval(const Surface& object) {
-        return SurfaceInterval::right_open(object->GetAddr() + 1,
-                                           object->GetAddr() + object->GetMemorySize() - 1);
+        return SurfaceInterval::right_open(object->GetCacheAddr() + 1,
+                                           object->GetCacheAddr() + object->GetMemorySize() - 1);
     }
 
     // Reinterpreted surfaces are very fragil as the game may keep rendering into them.
@@ -523,7 +524,7 @@ private:
         reinterpret_surface->MarkReinterpreted();
     }
 
-    Surface CollideOnReinterpretedSurface(VAddr addr) const {
+    Surface CollideOnReinterpretedSurface(CacheAddr addr) const {
         const SurfaceInterval interval{addr};
         for (auto& pair :
              boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4883e4f62..60a04e146 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -42,9 +42,9 @@ VAddr GetShaderAddress(Maxwell::ShaderProgram program) {
 }
 
 /// Gets the shader program code from memory for the specified address
-ProgramCode GetShaderCode(VAddr addr) {
+ProgramCode GetShaderCode(const u8* host_ptr) {
     ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
-    Memory::ReadBlock(addr, program_code.data(), program_code.size() * sizeof(u64));
+    std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
     return program_code;
 }
 
@@ -214,12 +214,13 @@ std::set<GLenum> GetSupportedFormats() {
 
 } // namespace
 
-CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
-                           ShaderDiskCacheOpenGL& disk_cache,
+CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier,
+                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                            const PrecompiledPrograms& precompiled_programs,
-                           ProgramCode&& program_code, ProgramCode&& program_code_b)
-    : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type},
-      disk_cache{disk_cache}, precompiled_programs{precompiled_programs} {
+                           ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
+    : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, guest_addr{guest_addr},
+      unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
+      precompiled_programs{precompiled_programs} {
 
     const std::size_t code_size = CalculateProgramSize(program_code);
     const std::size_t code_size_b =
@@ -243,12 +244,13 @@ CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderPro
     disk_cache.SaveRaw(raw);
 }
 
-CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
-                           ShaderDiskCacheOpenGL& disk_cache,
+CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier,
+                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                            const PrecompiledPrograms& precompiled_programs,
-                           GLShader::ProgramResult result)
-    : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type},
-      disk_cache{disk_cache}, precompiled_programs{precompiled_programs} {
+                           GLShader::ProgramResult result, u8* host_ptr)
+    : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, guest_addr{guest_addr},
+      unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
+      precompiled_programs{precompiled_programs} {
 
     code = std::move(result.first);
     entries = result.second;
@@ -271,7 +273,7 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive
                 disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
             }
 
-            LabelGLObject(GL_PROGRAM, program->handle, addr);
+            LabelGLObject(GL_PROGRAM, program->handle, guest_addr);
         }
 
         handle = program->handle;
@@ -323,7 +325,7 @@ GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBind
         disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
     }
 
-    LabelGLObject(GL_PROGRAM, target_program->handle, addr, debug_name);
+    LabelGLObject(GL_PROGRAM, target_program->handle, guest_addr, debug_name);
 
     return target_program->handle;
 };
@@ -489,14 +491,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     const VAddr program_addr{GetShaderAddress(program)};
 
     // Look up shader in the cache based on address
-    Shader shader{TryGet(program_addr)};
+    const auto& host_ptr{Memory::GetPointer(program_addr)};
+    Shader shader{TryGet(host_ptr)};
 
     if (!shader) {
         // No shader found - create a new one
-        ProgramCode program_code = GetShaderCode(program_addr);
+        // Reuse the host pointer already resolved for the cache lookup above.
+        ProgramCode program_code{GetShaderCode(host_ptr)};
         ProgramCode program_code_b;
         if (program == Maxwell::ShaderProgram::VertexA) {
-            program_code_b = GetShaderCode(GetShaderAddress(Maxwell::ShaderProgram::VertexB));
+            program_code_b = GetShaderCode(
+                Memory::GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));
         }
         const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
 
@@ -504,11 +509,11 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         if (found != precompiled_shaders.end()) {
             shader =
                 std::make_shared<CachedShader>(program_addr, unique_identifier, program, disk_cache,
-                                               precompiled_programs, found->second);
+                                               precompiled_programs, found->second, host_ptr);
         } else {
             shader = std::make_shared<CachedShader>(
                 program_addr, unique_identifier, program, disk_cache, precompiled_programs,
-                std::move(program_code), std::move(program_code_b));
+                std::move(program_code), std::move(program_code_b), host_ptr);
         }
         Register(shader);
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 97eed192f..81fe716b4 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -39,18 +39,18 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
 
 class CachedShader final : public RasterizerCacheObject {
 public:
-    explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
-                          ShaderDiskCacheOpenGL& disk_cache,
+    explicit CachedShader(VAddr guest_addr, u64 unique_identifier,
+                          Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                           const PrecompiledPrograms& precompiled_programs,
-                          ProgramCode&& program_code, ProgramCode&& program_code_b);
+                          ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
 
-    explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
-                          ShaderDiskCacheOpenGL& disk_cache,
+    explicit CachedShader(VAddr guest_addr, u64 unique_identifier,
+                          Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                           const PrecompiledPrograms& precompiled_programs,
-                          GLShader::ProgramResult result);
+                          GLShader::ProgramResult result, u8* host_ptr);
 
-    VAddr GetAddr() const override {
-        return addr;
+    VAddr GetCpuAddr() const override {
+        return guest_addr;
     }
 
     std::size_t GetSizeInBytes() const override {
@@ -91,7 +91,8 @@ private:
 
     ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const;
 
-    VAddr addr{};
+    u8* host_ptr{};
+    VAddr guest_addr{};
     u64 unique_identifier{};
     Maxwell::ShaderProgram program_type{};
     ShaderDiskCacheOpenGL& disk_cache;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 8b510b6ae..b97576309 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -167,9 +167,11 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
         Memory::RasterizerFlushVirtualRegion(framebuffer_addr, size_in_bytes,
                                              Memory::FlushMode::Flush);
 
-        VideoCore::MortonCopyPixels128(framebuffer.width, framebuffer.height, bytes_per_pixel, 4,
-                                       Memory::GetPointer(framebuffer_addr),
-                                       gl_framebuffer_data.data(), true);
+        constexpr u32 linear_bpp = 4;
+        VideoCore::MortonCopyPixels128(VideoCore::MortonSwizzleMode::MortonToLinear,
+                                       framebuffer.width, framebuffer.height, bytes_per_pixel,
+                                       linear_bpp, Memory::GetPointer(framebuffer_addr),
+                                       gl_framebuffer_data.data());
 
         glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride));
 
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 4a33a6c84..95eab3fec 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -17,6 +17,11 @@
 
 namespace Vulkan {
 
+CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset,
+                                     std::size_t alignment, u8* host_ptr)
+    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
+      alignment{alignment} {}
+
 VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager,
                              VideoCore::RasterizerInterface& rasterizer, const VKDevice& device,
                              VKMemoryManager& memory_manager, VKScheduler& scheduler, u64 size)
@@ -37,16 +42,18 @@ VKBufferCache::~VKBufferCache() = default;
 u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment,
                                 bool cache) {
     const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)};
-    ASSERT(cpu_addr);
+    ASSERT_MSG(cpu_addr, "Invalid GPU address");
 
     // Cache management is a big overhead, so only cache entries with a given size.
     // TODO: Figure out which size is the best for given games.
     cache &= size >= 2048;
 
+    const auto& host_ptr{Memory::GetPointer(*cpu_addr)};
     if (cache) {
-        if (auto entry = TryGet(*cpu_addr); entry) {
-            if (entry->size >= size && entry->alignment == alignment) {
-                return entry->offset;
+        auto entry = TryGet(host_ptr);
+        if (entry) {
+            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
+                return entry->GetOffset();
             }
             Unregister(entry);
         }
@@ -55,17 +62,17 @@ u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64
     AlignBuffer(alignment);
     const u64 uploaded_offset = buffer_offset;
 
-    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+    if (!host_ptr) {
+        return uploaded_offset;
+    }
 
+    std::memcpy(buffer_ptr, host_ptr, size);
     buffer_ptr += size;
     buffer_offset += size;
 
     if (cache) {
-        auto entry = std::make_shared<CachedBufferEntry>();
-        entry->offset = uploaded_offset;
-        entry->size = size;
-        entry->alignment = alignment;
-        entry->addr = *cpu_addr;
+        auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset,
+                                                         alignment, host_ptr);
         Register(entry);
     }
 
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index d8e916f31..8b415744b 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -24,22 +24,39 @@ class VKFence;
 class VKMemoryManager;
 class VKStreamBuffer;
 
-struct CachedBufferEntry final : public RasterizerCacheObject {
-    VAddr GetAddr() const override {
-        return addr;
+class CachedBufferEntry final : public RasterizerCacheObject {
+public:
+    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, std::size_t alignment,
+                               u8* host_ptr);
+
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
     }
 
     std::size_t GetSizeInBytes() const override {
         return size;
     }
 
+    std::size_t GetSize() const {
+        return size;
+    }
+
+    u64 GetOffset() const {
+        return offset;
+    }
+
+    std::size_t GetAlignment() const {
+        return alignment;
+    }
+
     // We do not have to flush this cache as things in it are never modified by us.
     void Flush() override {}
 
-    VAddr addr;
-    std::size_t size;
-    u64 offset;
-    std::size_t alignment;
+private:
+    VAddr cpu_addr{};
+    std::size_t size{};
+    u64 offset{};
+    std::size_t alignment{};
 };
 
 class VKBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
new file mode 100644
index 000000000..ed3178f09
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -0,0 +1,81 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <optional>
+#include <unordered_map>
+
+#include "common/assert.h"
+#include "common/cityhash.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/maxwell_to_vk.h"
+#include "video_core/renderer_vulkan/vk_sampler_cache.h"
+#include "video_core/textures/texture.h"
+
+namespace Vulkan {
+
+static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4> color) {
+    // TODO(Rodrigo): Manage integer border colors
+    // Only three fixed float colors have a Vulkan enum; any other color yields an empty optional.
+    if (color == std::array<float, 4>{0, 0, 0, 0}) {
+        return vk::BorderColor::eFloatTransparentBlack;
+    }
+    if (color == std::array<float, 4>{0, 0, 0, 1}) {
+        return vk::BorderColor::eFloatOpaqueBlack;
+    }
+    const bool is_white = color == std::array<float, 4>{1, 1, 1, 1};
+    return is_white ? std::make_optional(vk::BorderColor::eFloatOpaqueWhite) : std::nullopt;
+}
+
+std::size_t SamplerCacheKey::Hash() const {
+    // CityHash64 takes the length in bytes, so pass sizeof(raw) to hash the whole descriptor.
+    const auto* bytes = reinterpret_cast<const char*>(raw.data());
+    return static_cast<std::size_t>(Common::CityHash64(bytes, sizeof(raw)));
+}
+
+bool SamplerCacheKey::operator==(const SamplerCacheKey& rhs) const {
+    return std::memcmp(raw.data(), rhs.raw.data(), sizeof(raw)) == 0; // bytewise comparison
+}
+
+VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {}
+// The defaulted destructor destroys the cache map together with the UniqueSampler values in it.
+VKSamplerCache::~VKSamplerCache() = default;
+
+vk::Sampler VKSamplerCache::GetSampler(const Tegra::Texture::TSCEntry& tsc) {
+    // On a miss try_emplace inserts a default-constructed entry that is filled in right away.
+    const auto lookup = cache.try_emplace(SamplerCacheKey{tsc});
+    if (lookup.second) {
+        lookup.first->second = CreateSampler(tsc);
+    }
+    return *lookup.first->second;
+}
+
+UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) {
+    // Anisotropic filtering is enabled only when the entry asks for more than 1x.
+    const float anisotropy = tsc.GetMaxAnisotropy();
+    const bool use_anisotropy = anisotropy > 1.0f;
+    // Border colors without a Vulkan equivalent are reported and replaced by transparent black.
+    const auto color = tsc.GetBorderColor();
+    const auto converted = TryConvertBorderColor(color);
+    UNIMPLEMENTED_IF_MSG(!converted, "Unimplemented border color {} {} {} {}", color[0],
+                         color[1], color[2], color[3]);
+    const auto border = converted.value_or(vk::BorderColor::eFloatTransparentBlack);
+
+    const auto mag_filter = MaxwellToVK::Sampler::Filter(tsc.mag_filter);
+    const auto min_filter = MaxwellToVK::Sampler::Filter(tsc.min_filter);
+    const auto mipmap_mode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter);
+    const auto address_u = MaxwellToVK::Sampler::WrapMode(tsc.wrap_u);
+    const auto address_v = MaxwellToVK::Sampler::WrapMode(tsc.wrap_v);
+    const auto address_w = MaxwellToVK::Sampler::WrapMode(tsc.wrap_p);
+    const auto compare_op = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func);
+    constexpr bool unnormalized_coords = false;
+    const vk::SamplerCreateInfo sampler_ci(
+        {}, mag_filter, min_filter, mipmap_mode, address_u, address_v, address_w,
+        tsc.GetLodBias(), use_anisotropy, anisotropy, tsc.depth_compare_enabled, compare_op,
+        tsc.GetMinLod(), tsc.GetMaxLod(), border, unnormalized_coords);
+    const auto& dispatch = device.GetDispatchLoader();
+    return device.GetLogical().createSamplerUnique(sampler_ci, nullptr, dispatch);
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.h b/src/video_core/renderer_vulkan/vk_sampler_cache.h
new file mode 100644
index 000000000..c6394dc87
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.h
@@ -0,0 +1,56 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "common/common_types.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/textures/texture.h"
+
+namespace Vulkan {
+
+class VKDevice;
+
+struct SamplerCacheKey final : public Tegra::Texture::TSCEntry {
+    std::size_t Hash() const;
+    /// Two keys are equal when the raw contents of their descriptors match.
+    bool operator==(const SamplerCacheKey& rhs) const;
+    /// Logical negation of operator==.
+    bool operator!=(const SamplerCacheKey& rhs) const {
+        return !(*this == rhs);
+    }
+};
+
+} // namespace Vulkan
+
+namespace std {
+
+template <>
+struct hash<Vulkan::SamplerCacheKey> {
+    std::size_t operator()(const Vulkan::SamplerCacheKey& key) const noexcept {
+        return key.Hash(); // forwards to the key's own hash function
+    }
+};
+
+} // namespace std
+
+namespace Vulkan {
+
+class VKSamplerCache {
+public:
+    explicit VKSamplerCache(const VKDevice& device);
+    ~VKSamplerCache();
+    /// Returns the sampler matching the TSC entry, creating it on the first request.
+    vk::Sampler GetSampler(const Tegra::Texture::TSCEntry& tsc);
+
+private:
+    UniqueSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc);
+    /// Device the samplers are created on; held by reference, so it must outlive this cache.
+    const VKDevice& device;
+    std::unordered_map<SamplerCacheKey, UniqueSampler> cache;
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 8c278c0e2..93ecc6e31 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -283,31 +283,36 @@ enum class TextureMipmapFilter : u32 {
 
 struct TSCEntry {
     union {
-        BitField<0, 3, WrapMode> wrap_u;
-        BitField<3, 3, WrapMode> wrap_v;
-        BitField<6, 3, WrapMode> wrap_p;
-        BitField<9, 1, u32> depth_compare_enabled;
-        BitField<10, 3, DepthCompareFunc> depth_compare_func;
-        BitField<13, 1, u32> srgb_conversion;
-        BitField<20, 3, u32> max_anisotropy;
+        struct {
+            union {
+                BitField<0, 3, WrapMode> wrap_u;
+                BitField<3, 3, WrapMode> wrap_v;
+                BitField<6, 3, WrapMode> wrap_p;
+                BitField<9, 1, u32> depth_compare_enabled;
+                BitField<10, 3, DepthCompareFunc> depth_compare_func;
+                BitField<13, 1, u32> srgb_conversion;
+                BitField<20, 3, u32> max_anisotropy;
+            };
+            union {
+                BitField<0, 2, TextureFilter> mag_filter;
+                BitField<4, 2, TextureFilter> min_filter;
+                BitField<6, 2, TextureMipmapFilter> mipmap_filter;
+                BitField<9, 1, u32> cubemap_interface_filtering;
+                BitField<12, 13, u32> mip_lod_bias;
+            };
+            union {
+                BitField<0, 12, u32> min_lod_clamp;
+                BitField<12, 12, u32> max_lod_clamp;
+                BitField<24, 8, u32> srgb_border_color_r;
+            };
+            union {
+                BitField<12, 8, u32> srgb_border_color_g;
+                BitField<20, 8, u32> srgb_border_color_b;
+            };
+            std::array<f32, 4> border_color;
+        };
+        std::array<u8, 0x20> raw;
     };
-    union {
-        BitField<0, 2, TextureFilter> mag_filter;
-        BitField<4, 2, TextureFilter> min_filter;
-        BitField<6, 2, TextureMipmapFilter> mipmap_filter;
-        BitField<9, 1, u32> cubemap_interface_filtering;
-        BitField<12, 13, u32> mip_lod_bias;
-    };
-    union {
-        BitField<0, 12, u32> min_lod_clamp;
-        BitField<12, 12, u32> max_lod_clamp;
-        BitField<24, 8, u32> srgb_border_color_r;
-    };
-    union {
-        BitField<12, 8, u32> srgb_border_color_g;
-        BitField<20, 8, u32> srgb_border_color_b;
-    };
-    std::array<f32, 4> border_color;
 
     float GetMaxAnisotropy() const {
         return static_cast<float>(1U << max_anisotropy);
@@ -324,7 +329,7 @@ struct TSCEntry {
     float GetLodBias() const {
         // Sign extend the 13-bit value.
         constexpr u32 mask = 1U << (13 - 1);
-        return static_cast<float>((mip_lod_bias ^ mask) - mask) / 256.0f;
+        return static_cast<s32>((mip_lod_bias ^ mask) - mask) / 256.0f;
     }
 
     std::array<float, 4> GetBorderColor() const {