Diffstat (limited to 'src/video_core')
-rw-r--r-- src/video_core/CMakeLists.txt | 6
-rw-r--r-- src/video_core/debug_utils/debug_utils.cpp | 4
-rw-r--r-- src/video_core/debug_utils/debug_utils.h | 4
-rw-r--r-- src/video_core/dma_pusher.cpp | 7
-rw-r--r-- src/video_core/dma_pusher.h | 1
-rw-r--r-- src/video_core/engines/fermi_2d.cpp | 3
-rw-r--r-- src/video_core/engines/fermi_2d.h | 8
-rw-r--r-- src/video_core/engines/kepler_compute.h | 10
-rw-r--r-- src/video_core/engines/kepler_memory.cpp | 19
-rw-r--r-- src/video_core/engines/kepler_memory.h | 7
-rw-r--r-- src/video_core/engines/maxwell_3d.cpp | 61
-rw-r--r-- src/video_core/engines/maxwell_3d.h | 24
-rw-r--r-- src/video_core/engines/maxwell_dma.cpp | 41
-rw-r--r-- src/video_core/engines/maxwell_dma.h | 9
-rw-r--r-- src/video_core/engines/shader_bytecode.h | 2
-rw-r--r-- src/video_core/gpu.cpp | 17
-rw-r--r-- src/video_core/gpu.h | 23
-rw-r--r-- src/video_core/gpu_asynch.cpp | 8
-rw-r--r-- src/video_core/gpu_asynch.h | 6
-rw-r--r-- src/video_core/gpu_synch.cpp | 6
-rw-r--r-- src/video_core/gpu_synch.h | 6
-rw-r--r-- src/video_core/gpu_thread.cpp | 155
-rw-r--r-- src/video_core/gpu_thread.h | 136
-rw-r--r-- src/video_core/macro_interpreter.cpp | 20
-rw-r--r-- src/video_core/memory_manager.cpp | 557
-rw-r--r-- src/video_core/memory_manager.h | 171
-rw-r--r-- src/video_core/morton.cpp | 15
-rw-r--r-- src/video_core/morton.h | 2
-rw-r--r-- src/video_core/rasterizer_cache.h | 74
-rw-r--r-- src/video_core/rasterizer_interface.h | 9
-rw-r--r-- src/video_core/renderer_opengl/gl_buffer_cache.cpp | 31
-rw-r--r-- src/video_core/renderer_opengl/gl_buffer_cache.h | 33
-rw-r--r-- src/video_core/renderer_opengl/gl_global_cache.cpp | 42
-rw-r--r-- src/video_core/renderer_opengl/gl_global_cache.h | 17
-rw-r--r-- src/video_core/renderer_opengl/gl_primitive_assembler.cpp | 12
-rw-r--r-- src/video_core/renderer_opengl/gl_primitive_assembler.h | 4
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp | 36
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.h | 17
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer_cache.cpp | 105
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer_cache.h | 49
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_cache.cpp | 68
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_cache.h | 24
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 169
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_decompiler.h | 1
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | 46
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_gen.cpp | 1
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_gen.h | 3
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_manager.cpp | 17
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_manager.h | 12
-rw-r--r-- src/video_core/renderer_opengl/gl_state.cpp | 610
-rw-r--r-- src/video_core/renderer_opengl/gl_state.h | 52
-rw-r--r-- src/video_core/renderer_opengl/renderer_opengl.cpp | 6
-rw-r--r-- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 31
-rw-r--r-- src/video_core/renderer_vulkan/vk_buffer_cache.h | 34
-rw-r--r-- src/video_core/renderer_vulkan/vk_resource_manager.cpp | 2
-rw-r--r-- src/video_core/renderer_vulkan/vk_resource_manager.h | 2
-rw-r--r-- src/video_core/renderer_vulkan/vk_swapchain.cpp | 210
-rw-r--r-- src/video_core/renderer_vulkan/vk_swapchain.h | 92
-rw-r--r-- src/video_core/shader/decode/texture.cpp | 114
-rw-r--r-- src/video_core/shader/shader_ir.h | 12
-rw-r--r-- src/video_core/shader/track.cpp | 17
-rw-r--r-- src/video_core/textures/decoders.cpp | 32
-rw-r--r-- src/video_core/textures/decoders.h | 13
63 files changed, 2128 insertions(+), 1197 deletions(-)
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 14b76680f..242a0d1cd 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -128,7 +128,9 @@ if (ENABLE_VULKAN)
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_stream_buffer.cpp
- renderer_vulkan/vk_stream_buffer.h)
+ renderer_vulkan/vk_stream_buffer.h
+ renderer_vulkan/vk_swapchain.cpp
+ renderer_vulkan/vk_swapchain.h)
target_include_directories(video_core PRIVATE ../../externals/Vulkan-Headers/include)
target_compile_definitions(video_core PRIVATE HAS_VULKAN)
@@ -137,4 +139,4 @@ endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad lz4_static)
+target_link_libraries(video_core PRIVATE glad)
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 5ffb492ea..f0ef67535 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -10,7 +10,7 @@ namespace Tegra {
void DebugContext::DoOnEvent(Event event, void* data) {
{
- std::unique_lock<std::mutex> lock(breakpoint_mutex);
+ std::unique_lock lock{breakpoint_mutex};
// TODO(Subv): Commit the rasterizer's caches so framebuffers, render targets, etc. will
// show on debug widgets
@@ -32,7 +32,7 @@ void DebugContext::DoOnEvent(Event event, void* data) {
void DebugContext::Resume() {
{
- std::lock_guard<std::mutex> lock(breakpoint_mutex);
+ std::lock_guard lock{breakpoint_mutex};
// Tell all observers that we are about to resume
for (auto& breakpoint_observer : breakpoint_observers) {
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index c235faf46..ac3a2eb01 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -40,7 +40,7 @@ public:
/// Constructs the object such that it observes events of the given DebugContext.
explicit BreakPointObserver(std::shared_ptr<DebugContext> debug_context)
: context_weak(debug_context) {
- std::unique_lock<std::mutex> lock(debug_context->breakpoint_mutex);
+ std::unique_lock lock{debug_context->breakpoint_mutex};
debug_context->breakpoint_observers.push_back(this);
}
@@ -48,7 +48,7 @@ public:
auto context = context_weak.lock();
if (context) {
{
- std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
+ std::unique_lock lock{context->breakpoint_mutex};
context->breakpoint_observers.remove(this);
}
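
The lock changes above drop the explicit template argument in favor of C++17 class template argument deduction. A minimal standalone sketch of the before/after, assuming only the standard library:

    #include <mutex>

    std::mutex breakpoint_mutex;

    void Before() {
        // Pre-C++17: the mutex type must be spelled out.
        std::unique_lock<std::mutex> lock(breakpoint_mutex);
    }

    void After() {
        // C++17 CTAD: the template argument is deduced from the constructor
        // argument, and brace-initialization sidesteps the most-vexing-parse.
        std::unique_lock lock{breakpoint_mutex};
    }
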
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index bff1a37ff..8b1bea1ae 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -55,12 +55,9 @@ bool DmaPusher::Step() {
}
// Push buffer non-empty, read a word
- const auto address = gpu.MemoryManager().GpuToCpuAddress(dma_get);
- ASSERT_MSG(address, "Invalid GPU address");
-
command_headers.resize(command_list_header.size);
-
- Memory::ReadBlock(*address, command_headers.data(), command_list_header.size * sizeof(u32));
+ gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
+ command_list_header.size * sizeof(u32));
for (const CommandHeader& command_header : command_headers) {
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 27a36348c..6ab06518f 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -9,7 +9,6 @@
#include "common/bit_field.h"
#include "common/common_types.h"
-#include "video_core/memory_manager.h"
namespace Tegra {
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 03b7ee5d8..55966eef1 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,12 +6,13 @@
#include "common/logging/log.h"
#include "common/math_util.h"
#include "video_core/engines/fermi_2d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra::Engines {
Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
- : memory_manager(memory_manager), rasterizer{rasterizer} {}
+ : rasterizer{rasterizer}, memory_manager{memory_manager} {}
void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 80523e320..2e51b7f13 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -10,7 +10,10 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
+
+namespace Tegra {
+class MemoryManager;
+}
namespace VideoCore {
class RasterizerInterface;
@@ -115,10 +118,9 @@ public:
};
} regs{};
- MemoryManager& memory_manager;
-
private:
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 6575afd0f..fb6cdf432 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -9,7 +9,10 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
+
+namespace Tegra {
+class MemoryManager;
+}
namespace Tegra::Engines {
@@ -40,10 +43,11 @@ public:
static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
"KeplerCompute Regs has wrong size");
- MemoryManager& memory_manager;
-
/// Write the value to the register identified by method.
void CallMethod(const GPU::MethodCall& method_call);
+
+private:
+ MemoryManager& memory_manager;
};
#define ASSERT_REG_POSITION(field_name, position) \
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index aae2a4019..cd51a31d7 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -5,16 +5,17 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : system{system}, memory_manager(memory_manager), rasterizer{rasterizer} {}
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
KeplerMemory::~KeplerMemory() = default;
@@ -40,17 +41,13 @@ void KeplerMemory::ProcessData(u32 data) {
ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
- const GPUVAddr address = regs.dest.Address();
- const auto dest_address =
- memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32));
- ASSERT_MSG(dest_address, "Invalid GPU address");
-
// We have to invalidate the destination region to evict any outdated surfaces from the cache.
- // We do this before actually writing the new data because the destination address might contain
- // a dirty surface that will have to be written back to memory.
- Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32));
+ // We do this before actually writing the new data because the destination address might
+ // contain a dirty surface that will have to be written back to memory.
+ const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)};
+ rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32));
+ memory_manager.Write<u32>(address, data);
- Memory::Write32(*dest_address, data);
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
state.write_offset++;
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 9181e9d80..78b6c3e45 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -10,12 +10,15 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -82,8 +85,8 @@ public:
private:
Core::System& system;
- MemoryManager& memory_manager;
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
void ProcessData(u32 data);
};
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 144e7fa82..74403eed4 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -7,11 +7,10 @@
#include "common/assert.h"
#include "core/core.h"
#include "core/core_timing.h"
-#include "core/memory.h"
#include "video_core/debug_utils/debug_utils.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
-#include "video_core/renderer_base.h"
#include "video_core/textures/texture.h"
namespace Tegra::Engines {
@@ -21,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : memory_manager(memory_manager), system{system}, rasterizer{rasterizer},
- macro_interpreter(*this) {
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{
+ *this} {
InitializeRegisterDefaults();
}
@@ -250,6 +249,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessQueryGet();
break;
}
+ case MAXWELL3D_REG_INDEX(sync_info): {
+ ProcessSyncPoint();
+ break;
+ }
default:
break;
}
@@ -270,11 +273,9 @@ void Maxwell3D::ProcessMacroBind(u32 data) {
}
void Maxwell3D::ProcessQueryGet() {
- GPUVAddr sequence_address = regs.query.QueryAddress();
+ const GPUVAddr sequence_address{regs.query.QueryAddress()};
// Since the sequence address is given as a GPU VAddr, we have to convert it to an application
// VAddr before writing.
- const auto address = memory_manager.GpuToCpuAddress(sequence_address);
- ASSERT_MSG(address, "Invalid GPU address");
// TODO(Subv): Support the other query units.
ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
@@ -309,7 +310,7 @@ void Maxwell3D::ProcessQueryGet() {
// Write the current query sequence to the sequence address.
// TODO(Subv): Find out what happens if you use a long query type but mark it as a short
// query.
- Memory::Write32(*address, sequence);
+ memory_manager.Write<u32>(sequence_address, sequence);
} else {
// Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
// GPU, this command may actually take a while to complete in real hardware due to GPU
@@ -318,7 +319,7 @@ void Maxwell3D::ProcessQueryGet() {
query_result.value = result;
// TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming
query_result.timestamp = system.CoreTiming().GetTicks();
- Memory::WriteBlock(*address, &query_result, sizeof(query_result));
+ memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
}
dirty_flags.OnMemoryWrite();
break;
@@ -329,6 +330,14 @@ void Maxwell3D::ProcessQueryGet() {
}
}
+void Maxwell3D::ProcessSyncPoint() {
+ const u32 sync_point = regs.sync_info.sync_point.Value();
+ const u32 increment = regs.sync_info.increment.Value();
+ const u32 cache_flush = regs.sync_info.unknown.Value();
+ LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
+ cache_flush);
+}
+
void Maxwell3D::DrawArrays() {
LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
regs.vertex_buffer.count);
@@ -393,10 +402,12 @@ void Maxwell3D::ProcessCBData(u32 value) {
// Don't allow writing past the end of the buffer.
ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
- const auto address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos);
- ASSERT_MSG(address, "Invalid GPU address");
+ const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
+
+ u8* ptr{memory_manager.GetPointer(address)};
+ rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
+ memory_manager.Write<u32>(address, value);
- Memory::Write32(*address, value);
dirty_flags.OnMemoryWrite();
// Increment the current buffer position.
@@ -404,14 +415,10 @@ void Maxwell3D::ProcessCBData(u32 value) {
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
- const GPUVAddr tic_base_address = regs.tic.TICAddress();
-
- const GPUVAddr tic_address_gpu = tic_base_address + tic_index * sizeof(Texture::TICEntry);
- const auto tic_address_cpu = memory_manager.GpuToCpuAddress(tic_address_gpu);
- ASSERT_MSG(tic_address_cpu, "Invalid GPU address");
+ const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
Texture::TICEntry tic_entry;
- Memory::ReadBlock(*tic_address_cpu, &tic_entry, sizeof(Texture::TICEntry));
+ memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
@@ -429,14 +436,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
}
Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
- const GPUVAddr tsc_base_address = regs.tsc.TSCAddress();
-
- const GPUVAddr tsc_address_gpu = tsc_base_address + tsc_index * sizeof(Texture::TSCEntry);
- const auto tsc_address_cpu = memory_manager.GpuToCpuAddress(tsc_address_gpu);
- ASSERT_MSG(tsc_address_cpu, "Invalid GPU address");
+ const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
Texture::TSCEntry tsc_entry;
- Memory::ReadBlock(*tsc_address_cpu, &tsc_entry, sizeof(Texture::TSCEntry));
+ memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
return tsc_entry;
}
@@ -455,10 +458,7 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset;
current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) {
- const auto address = memory_manager.GpuToCpuAddress(current_texture);
- ASSERT_MSG(address, "Invalid GPU address");
-
- const Texture::TextureHandle tex_handle{Memory::Read32(*address)};
+ const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(current_texture)};
Texture::FullTextureInfo tex_info{};
// TODO(Subv): Use the shader to determine which textures are actually accessed.
@@ -493,10 +493,7 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
- const auto tex_address_cpu = memory_manager.GpuToCpuAddress(tex_info_address);
- ASSERT_MSG(tex_address_cpu, "Invalid GPU address");
-
- const Texture::TextureHandle tex_handle{Memory::Read32(*tex_address_cpu)};
+ const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
Texture::FullTextureInfo tex_info{};
tex_info.index = static_cast<u32>(offset);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 7fbf1026e..321af3297 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -16,13 +16,16 @@
#include "common/math_util.h"
#include "video_core/gpu.h"
#include "video_core/macro_interpreter.h"
-#include "video_core/memory_manager.h"
#include "video_core/textures/texture.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -576,7 +579,17 @@ public:
u32 bind;
} macros;
- INSERT_PADDING_WORDS(0x188);
+ INSERT_PADDING_WORDS(0x69);
+
+ struct {
+ union {
+ BitField<0, 16, u32> sync_point;
+ BitField<16, 1, u32> unknown;
+ BitField<20, 1, u32> increment;
+ };
+ } sync_info;
+
+ INSERT_PADDING_WORDS(0x11E);
u32 tfb_enabled;
@@ -1093,7 +1106,6 @@ public:
};
State state{};
- MemoryManager& memory_manager;
struct DirtyFlags {
std::bitset<8> color_buffer{0xFF};
@@ -1141,6 +1153,8 @@ private:
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
+
/// Start offsets of each macro in macro_memory
std::unordered_map<u32, u32> macro_offsets;
@@ -1180,6 +1194,9 @@ private:
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
+ /// Handles writes to syncing register.
+ void ProcessSyncPoint();
+
/// Handles a write to the CB_DATA[i] register.
void ProcessCBData(u32 value);
@@ -1195,6 +1212,7 @@ private:
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(macros, 0x45);
+ASSERT_REG_POSITION(sync_info, 0xB2);
ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
ASSERT_REG_POSITION(rt, 0x200);
ASSERT_REG_POSITION(viewport_transform, 0x280);
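
For reference, the new sync_info register packs its fields as plain bit ranges at word offset 0xB2. A sketch decoding a raw register value with shifts and masks; this is illustrative only, as the engine itself reads the fields through the BitField wrappers shown above:

    #include <cstdint>

    struct SyncInfo {
        std::uint32_t sync_point; // bits 0-15
        std::uint32_t unknown;    // bit 16, logged as a cache-flush flag
        std::uint32_t increment;  // bit 20
    };

    SyncInfo DecodeSyncInfo(std::uint32_t raw) {
        return {
            raw & 0xFFFFu,      // BitField<0, 16, u32> sync_point
            (raw >> 16) & 0x1u, // BitField<16, 1, u32> unknown
            (raw >> 20) & 0x1u, // BitField<20, 1, u32> increment
        };
    }
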
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 9dfea5999..2426d0067 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,17 +5,18 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines {
MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : memory_manager(memory_manager), system{system}, rasterizer{rasterizer} {}
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -42,11 +43,6 @@ void MaxwellDMA::HandleCopy() {
const GPUVAddr source = regs.src_address.Address();
const GPUVAddr dest = regs.dst_address.Address();
- const auto source_cpu = memory_manager.GpuToCpuAddress(source);
- const auto dest_cpu = memory_manager.GpuToCpuAddress(dest);
- ASSERT_MSG(source_cpu, "Invalid source GPU address");
- ASSERT_MSG(dest_cpu, "Invalid destination GPU address");
-
// TODO(Subv): Perform more research and implement all features of this engine.
ASSERT(regs.exec.enable_swizzle == 0);
ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
@@ -69,7 +65,7 @@ void MaxwellDMA::HandleCopy() {
// buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count,
// y_count).
if (!regs.exec.enable_2d) {
- Memory::CopyBlock(*dest_cpu, *source_cpu, regs.x_count);
+ memory_manager.CopyBlock(dest, source, regs.x_count);
return;
}
@@ -78,9 +74,9 @@ void MaxwellDMA::HandleCopy() {
// rectangle. There is no need to manually flush/invalidate the regions because
// CopyBlock does that for us.
for (u32 line = 0; line < regs.y_count; ++line) {
- const VAddr source_line = *source_cpu + line * regs.src_pitch;
- const VAddr dest_line = *dest_cpu + line * regs.dst_pitch;
- Memory::CopyBlock(dest_line, source_line, regs.x_count);
+ const GPUVAddr source_line = source + line * regs.src_pitch;
+ const GPUVAddr dest_line = dest + line * regs.dst_pitch;
+ memory_manager.CopyBlock(dest_line, source_line, regs.x_count);
}
return;
}
@@ -89,15 +85,28 @@ void MaxwellDMA::HandleCopy() {
const std::size_t copy_size = regs.x_count * regs.y_count;
+ auto source_ptr{memory_manager.GetPointer(source)};
+ auto dst_ptr{memory_manager.GetPointer(dest)};
+
+ if (!source_ptr) {
+ LOG_ERROR(HW_GPU, "source_ptr is invalid");
+ return;
+ }
+
+ if (!dst_ptr) {
+ LOG_ERROR(HW_GPU, "dst_ptr is invalid");
+ return;
+ }
+
const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
// TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
// copying.
- Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size);
+ rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size);
// We have to invalidate the destination region to evict any outdated surfaces from the
// cache. We do this before actually writing the new data because the destination address
// might contain a dirty surface that will have to be written back to memory.
- Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size);
+ rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size);
};
if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
@@ -110,8 +119,8 @@ void MaxwellDMA::HandleCopy() {
copy_size * src_bytes_per_pixel);
Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
- regs.src_params.size_x, src_bytes_per_pixel, *source_cpu,
- *dest_cpu, regs.src_params.BlockHeight(), regs.src_params.pos_x,
+ regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr,
+ regs.src_params.BlockHeight(), regs.src_params.pos_x,
regs.src_params.pos_y);
} else {
ASSERT(regs.dst_params.size_z == 1);
@@ -124,7 +133,7 @@ void MaxwellDMA::HandleCopy() {
// If the input is linear and the output is tiled, swizzle the input and copy it over.
Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
- src_bpp, *dest_cpu, *source_cpu, regs.dst_params.BlockHeight());
+ src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight());
}
}
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 34c369320..c6b649842 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -10,12 +10,15 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -139,13 +142,13 @@ public:
};
} regs{};
- MemoryManager& memory_manager;
-
private:
Core::System& system;
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
+
/// Performs the copy from the source buffer to the destination buffer as configured in the
/// registers.
void HandleCopy();
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 7f613370b..363e53be1 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -1662,7 +1662,7 @@ private:
INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"),
INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"),
INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"),
- INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
+ INST("0011100-01000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
INST("000001----------", Id::LOP32I, Type::ArithmeticIntegerImmediate, "LOP32I"),
INST("0000001---------", Id::LOP3_C, Type::ArithmeticInteger, "LOP3_C"),
INST("0101101111100---", Id::LOP3_R, Type::ArithmeticInteger, "LOP3_R"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 08abf8ac9..4461083ff 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -12,6 +12,7 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
namespace Tegra {
@@ -30,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
auto& rasterizer{renderer.Rasterizer()};
- memory_manager = std::make_unique<Tegra::MemoryManager>();
+ memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
@@ -274,7 +275,6 @@ void GPU::ProcessSemaphoreTriggerMethod() {
const auto op =
static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
if (op == GpuSemaphoreOperation::WriteLong) {
- auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
struct Block {
u32 sequence;
u32 zeros = 0;
@@ -286,11 +286,10 @@ void GPU::ProcessSemaphoreTriggerMethod() {
// TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
// CoreTiming
block.timestamp = Core::System::GetInstance().CoreTiming().GetTicks();
- Memory::WriteBlock(*address, &block, sizeof(block));
+ memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
+ sizeof(block));
} else {
- const auto address =
- memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
- const u32 word = Memory::Read32(*address);
+ const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
(op == GpuSemaphoreOperation::AcquireGequal &&
static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
@@ -317,13 +316,11 @@ void GPU::ProcessSemaphoreTriggerMethod() {
}
void GPU::ProcessSemaphoreRelease() {
- const auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
- Memory::Write32(*address, regs.semaphore_release);
+ memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release);
}
void GPU::ProcessSemaphoreAcquire() {
- const auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
- const u32 word = Memory::Read32(*address);
+ const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
const auto value = regs.semaphore_acquire;
if (word != value) {
regs.acquire_active = true;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 56a203275..de30ea354 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -9,7 +9,11 @@
#include "common/common_types.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/dma_pusher.h"
-#include "video_core/memory_manager.h"
+
+using CacheAddr = std::uintptr_t;
+inline CacheAddr ToCacheAddr(const void* host_ptr) {
+ return reinterpret_cast<CacheAddr>(host_ptr);
+}
namespace Core {
class System;
@@ -119,6 +123,8 @@ enum class EngineID {
MAXWELL_DMA_COPY_A = 0xB0B5,
};
+class MemoryManager;
+
class GPU {
public:
explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
@@ -171,11 +177,11 @@ public:
u32 address_high;
u32 address_low;
- GPUVAddr SmaphoreAddress() const {
+ GPUVAddr SemaphoreAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
- } smaphore_address;
+ } semaphore_address;
u32 semaphore_sequence;
u32 semaphore_trigger;
@@ -209,13 +215,13 @@ public:
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
- virtual void FlushRegion(VAddr addr, u64 size) = 0;
+ virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be invalidated
- virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+ virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
- virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+ virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
private:
void ProcessBindMethod(const MethodCall& method_call);
@@ -239,9 +245,8 @@ protected:
private:
std::unique_ptr<Tegra::MemoryManager> memory_manager;
- /// Mapping of command subchannels to their bound engine ids.
+ /// Mapping of command subchannels to their bound engine ids
std::array<EngineID, 8> bound_engines = {};
-
/// 3D engine
std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
/// 2D engine
@@ -258,7 +263,7 @@ private:
static_assert(offsetof(GPU::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position")
-ASSERT_REG_POSITION(smaphore_address, 0x4);
+ASSERT_REG_POSITION(semaphore_address, 0x4);
ASSERT_REG_POSITION(semaphore_sequence, 0x6);
ASSERT_REG_POSITION(semaphore_trigger, 0x7);
ASSERT_REG_POSITION(reference_count, 0x14);
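
The new CacheAddr type keys rasterizer caches by host pointer rather than by emulated virtual address. A small sketch of how a flush range might be derived under this scheme; FlushSurface is a hypothetical helper, not part of the patch:

    #include <cstdint>

    using CacheAddr = std::uintptr_t;

    inline CacheAddr ToCacheAddr(const void* host_ptr) {
        return reinterpret_cast<CacheAddr>(host_ptr);
    }

    // Hypothetical helper: the flush range is derived from the host pointer
    // backing the GPU address, not from a guest VAddr.
    void FlushSurface(const std::uint8_t* host_memory, std::uint64_t size_in_bytes) {
        const CacheAddr start{ToCacheAddr(host_memory)};
        const CacheAddr end{start + size_in_bytes};
        // ... look up [start, end) in a cache keyed by CacheAddr ...
    }
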
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index ad0a747e3..db507cf04 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -9,7 +9,7 @@
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
- : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {}
+ : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {}
GPUAsynch::~GPUAsynch() = default;
@@ -22,15 +22,15 @@ void GPUAsynch::SwapBuffers(
gpu_thread.SwapBuffers(std::move(framebuffer));
}
-void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
+void GPUAsynch::FlushRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
-void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
+void GPUAsynch::InvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
-void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index e6a807aba..1dcc61a6c 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -26,9 +26,9 @@ public:
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
- void FlushRegion(VAddr addr, u64 size) override;
- void InvalidateRegion(VAddr addr, u64 size) override;
- void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+ void FlushRegion(CacheAddr addr, u64 size) override;
+ void InvalidateRegion(CacheAddr addr, u64 size) override;
+ void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
private:
GPUThread::ThreadManager gpu_thread;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 4c00b96c7..2cfc900ed 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -22,15 +22,15 @@ void GPUSynch::SwapBuffers(
renderer.SwapBuffers(std::move(framebuffer));
}
-void GPUSynch::FlushRegion(VAddr addr, u64 size) {
+void GPUSynch::FlushRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().FlushRegion(addr, size);
}
-void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
+void GPUSynch::InvalidateRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().InvalidateRegion(addr, size);
}
-void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void GPUSynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);
}
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 7d5a241ff..766b5631c 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -21,9 +21,9 @@ public:
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
- void FlushRegion(VAddr addr, u64 size) override;
- void InvalidateRegion(VAddr addr, u64 size) override;
- void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+ void FlushRegion(CacheAddr addr, u64 size) override;
+ void InvalidateRegion(CacheAddr addr, u64 size) override;
+ void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
};
} // namespace VideoCommon
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c5bdd2a17..cc56cf467 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,8 +4,10 @@
#include "common/assert.h"
#include "common/microprofile.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
#include "core/frontend/scope_acquire_window_context.h"
-#include "core/settings.h"
#include "video_core/dma_pusher.h"
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
@@ -13,38 +15,13 @@
namespace VideoCommon::GPUThread {
-/// Executes a single GPU thread command
-static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
- Tegra::DmaPusher& dma_pusher) {
- if (const auto submit_list = std::get_if<SubmitListCommand>(command)) {
- dma_pusher.Push(std::move(submit_list->entries));
- dma_pusher.DispatchCalls();
- } else if (const auto data = std::get_if<SwapBuffersCommand>(command)) {
- renderer.SwapBuffers(data->framebuffer);
- } else if (const auto data = std::get_if<FlushRegionCommand>(command)) {
- renderer.Rasterizer().FlushRegion(data->addr, data->size);
- } else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) {
- renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
- } else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
- renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size);
- } else {
- UNREACHABLE();
- }
-}
-
/// Runs the GPU thread
static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
SynchState& state) {
-
MicroProfileOnThreadCreate("GpuThread");
- auto WaitForWakeup = [&]() {
- std::unique_lock<std::mutex> lock{state.signal_mutex};
- state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
- };
-
// Wait for first GPU command before acquiring the window context
- WaitForWakeup();
+ state.WaitForCommands();
// If emulation was stopped during disk shader loading, abort before trying to acquire context
if (!state.is_running) {
@@ -53,99 +30,91 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+ CommandDataContainer next;
while (state.is_running) {
- if (!state.is_running) {
- return;
- }
-
- {
- // Thread has been woken up, so make the previous write queue the next read queue
- std::lock_guard<std::mutex> lock{state.signal_mutex};
- std::swap(state.push_queue, state.pop_queue);
+ state.WaitForCommands();
+ while (!state.queue.Empty()) {
+ state.queue.Pop(next);
+ if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
+ dma_pusher.Push(std::move(submit_list->entries));
+ dma_pusher.DispatchCalls();
+ } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
+ renderer.SwapBuffers(std::move(data->framebuffer));
+ } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
+ renderer.Rasterizer().FlushRegion(data->addr, data->size);
+ } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
+ renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
+ } else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) {
+ return;
+ } else {
+ UNREACHABLE();
+ }
+ state.signaled_fence = next.fence;
+ state.TrySynchronize();
}
-
- // Execute all of the GPU commands
- while (!state.pop_queue->empty()) {
- ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
- state.pop_queue->pop();
- }
-
- state.UpdateIdleState();
-
- // Signal that the GPU thread has finished processing commands
- if (state.is_idle) {
- state.idle_condition.notify_one();
- }
-
- // Wait for CPU thread to send more GPU commands
- WaitForWakeup();
}
}
-ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
- : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
- std::ref(dma_pusher), std::ref(state)},
- thread_id{thread.get_id()} {}
+ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
+ Tegra::DmaPusher& dma_pusher)
+ : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} {
+ synchronization_event = system.CoreTiming().RegisterEvent(
+ "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
+}
ThreadManager::~ThreadManager() {
- {
- // Notify GPU thread that a shutdown is pending
- std::lock_guard<std::mutex> lock{state.signal_mutex};
- state.is_running = false;
- }
-
- state.signal_condition.notify_one();
+ // Notify GPU thread that a shutdown is pending
+ PushCommand(EndProcessingCommand());
thread.join();
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
- if (entries.empty()) {
- return;
- }
-
- PushCommand(SubmitListCommand(std::move(entries)), false, false);
+ const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
+ const s64 synchronization_ticks{Core::Timing::usToCycles(9000)};
+ system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
}
void ThreadManager::SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
- PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false);
+ PushCommand(SwapBuffersCommand(std::move(framebuffer)));
}
-void ThreadManager::FlushRegion(VAddr addr, u64 size) {
- // Block the CPU when using accurate emulation
- PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false);
+void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
+ PushCommand(FlushRegionCommand(addr, size));
}
-void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
- PushCommand(InvalidateRegionCommand(addr, size), true, true);
+void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
+ if (state.queue.Empty()) {
+ // It's quicker to invalidate a single region on the CPU if the queue is already empty
+ system.Renderer().Rasterizer().InvalidateRegion(addr, size);
+ } else {
+ PushCommand(InvalidateRegionCommand(addr, size));
+ }
}
-void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
+ // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
InvalidateRegion(addr, size);
}
-void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) {
- {
- std::lock_guard<std::mutex> lock{state.signal_mutex};
-
- if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
- // Execute the command synchronously on the current thread
- ExecuteCommand(&command_data, renderer, dma_pusher);
- return;
- }
+u64 ThreadManager::PushCommand(CommandData&& command_data) {
+ const u64 fence{++state.last_fence};
+ state.queue.Push(CommandDataContainer(std::move(command_data), fence));
+ state.SignalCommands();
+ return fence;
+}
- // Push the command to the GPU thread
- state.UpdateIdleState();
- state.push_queue->emplace(command_data);
+MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
+void SynchState::WaitForSynchronization(u64 fence) {
+ if (signaled_fence >= fence) {
+ return;
}
- // Signal the GPU thread that commands are pending
- state.signal_condition.notify_one();
-
- if (wait_for_idle) {
- // Wait for the GPU to be idle (all commands to be executed)
- std::unique_lock<std::mutex> lock{state.idle_mutex};
- state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
+ // Wait for the GPU to be idle (all commands to be executed)
+ {
+ MICROPROFILE_SCOPE(GPU_wait);
+ std::unique_lock<std::mutex> lock{synchronization_mutex};
+ synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
}
}
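
The fence protocol above replaces the old idle-flag handshake: the producer tags every command with a monotonically increasing fence, the GPU thread publishes the last fence it completed, and waiters block until the published value catches up. A reduced sketch of that pattern, assuming nothing beyond the standard library:

    #include <atomic>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    struct FenceState {
        std::uint64_t last_fence{};            // producer side, CPU thread only
        std::atomic<std::uint64_t> signaled{}; // consumer side, GPU thread

        std::mutex mutex;
        std::condition_variable condition;

        // CPU thread: tag a command and return its fence.
        std::uint64_t Push() {
            return ++last_fence;
        }

        // GPU thread: publish completion of a command and wake waiters.
        void Signal(std::uint64_t fence) {
            signaled = fence;
            std::lock_guard lock{mutex};
            condition.notify_one();
        }

        // CPU thread: block until the GPU has reached `fence`.
        void Wait(std::uint64_t fence) {
            std::unique_lock lock{mutex};
            condition.wait(lock, [this, fence] { return signaled >= fence; });
        }
    };
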
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index edb148b14..62bcea5bb 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -4,26 +4,33 @@
#pragma once
-#include <array>
#include <atomic>
#include <condition_variable>
-#include <memory>
#include <mutex>
#include <optional>
#include <thread>
#include <variant>
+#include "common/threadsafe_queue.h"
+#include "video_core/gpu.h"
+
namespace Tegra {
struct FramebufferConfig;
class DmaPusher;
} // namespace Tegra
-namespace VideoCore {
-class RendererBase;
-} // namespace VideoCore
+namespace Core {
+class System;
+namespace Timing {
+struct EventType;
+} // namespace Timing
+} // namespace Core
namespace VideoCommon::GPUThread {
+/// Command to signal to the GPU thread that processing has ended
+struct EndProcessingCommand final {};
+
/// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
@@ -36,65 +43,103 @@ struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
: framebuffer{std::move(framebuffer)} {}
- std::optional<const Tegra::FramebufferConfig> framebuffer;
+ std::optional<Tegra::FramebufferConfig> framebuffer;
};
/// Command to signal to the GPU thread to flush a region
struct FlushRegionCommand final {
- explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+ explicit constexpr FlushRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
- const VAddr addr;
- const u64 size;
+ CacheAddr addr;
+ u64 size;
};
/// Command to signal to the GPU thread to invalidate a region
struct InvalidateRegionCommand final {
- explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+ explicit constexpr InvalidateRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
- const VAddr addr;
- const u64 size;
+ CacheAddr addr;
+ u64 size;
};
/// Command to signal to the GPU thread to flush and invalidate a region
struct FlushAndInvalidateRegionCommand final {
- explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
+ explicit constexpr FlushAndInvalidateRegionCommand(CacheAddr addr, u64 size)
: addr{addr}, size{size} {}
- const VAddr addr;
- const u64 size;
+ CacheAddr addr;
+ u64 size;
};
-using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
- InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+using CommandData =
+ std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+ InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+
+struct CommandDataContainer {
+ CommandDataContainer() = default;
+
+ CommandDataContainer(CommandData&& data, u64 next_fence)
+ : data{std::move(data)}, fence{next_fence} {}
+
+ CommandDataContainer& operator=(const CommandDataContainer& t) {
+ data = std::move(t.data);
+ fence = t.fence;
+ return *this;
+ }
+
+ CommandData data;
+ u64 fence{};
+};
/// Struct used to synchronize the GPU thread
struct SynchState final {
- std::atomic<bool> is_running{true};
- std::atomic<bool> is_idle{true};
- std::condition_variable signal_condition;
- std::mutex signal_mutex;
- std::condition_variable idle_condition;
- std::mutex idle_mutex;
-
- // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and
- // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes
- // empty. This allows for efficient thread-safe access, as it does not require any copies.
-
- using CommandQueue = std::queue<CommandData>;
- std::array<CommandQueue, 2> command_queues;
- CommandQueue* push_queue{&command_queues[0]};
- CommandQueue* pop_queue{&command_queues[1]};
-
- void UpdateIdleState() {
- std::lock_guard<std::mutex> lock{idle_mutex};
- is_idle = command_queues[0].empty() && command_queues[1].empty();
+ std::atomic_bool is_running{true};
+ std::atomic_int queued_frame_count{};
+ std::mutex synchronization_mutex;
+ std::mutex commands_mutex;
+ std::condition_variable commands_condition;
+ std::condition_variable synchronization_condition;
+
+ /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
+ /// synchronized. This is entirely empirical.
+ bool IsSynchronized() const {
+ constexpr std::size_t max_queue_gap{5};
+ return queue.Size() <= max_queue_gap;
+ }
+
+ void TrySynchronize() {
+ if (IsSynchronized()) {
+ std::lock_guard<std::mutex> lock{synchronization_mutex};
+ synchronization_condition.notify_one();
+ }
}
+
+ void WaitForSynchronization(u64 fence);
+
+ void SignalCommands() {
+ if (queue.Empty()) {
+ return;
+ }
+
+ commands_condition.notify_one();
+ }
+
+ void WaitForCommands() {
+ std::unique_lock lock{commands_mutex};
+ commands_condition.wait(lock, [this] { return !queue.Empty(); });
+ }
+
+ using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+ CommandQueue queue;
+ u64 last_fence{};
+ std::atomic<u64> signaled_fence{};
};
/// Class used to manage the GPU thread
class ThreadManager final {
public:
- explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+ explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
+ Tegra::DmaPusher& dma_pusher);
~ThreadManager();
/// Push GPU command entries to be processed
@@ -105,27 +150,22 @@ public:
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
- void FlushRegion(VAddr addr, u64 size);
+ void FlushRegion(CacheAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated
- void InvalidateRegion(VAddr addr, u64 size);
+ void InvalidateRegion(CacheAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
- void FlushAndInvalidateRegion(VAddr addr, u64 size);
+ void FlushAndInvalidateRegion(CacheAddr addr, u64 size);
private:
/// Pushes a command to be executed by the GPU thread
- void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu);
-
- /// Returns true if this is called by the GPU thread
- bool IsGpuThread() const {
- return std::this_thread::get_id() == thread_id;
- }
+ u64 PushCommand(CommandData&& command_data);
private:
SynchState state;
- VideoCore::RendererBase& renderer;
- Tegra::DmaPusher& dma_pusher;
+ Core::System& system;
+ Core::Timing::EventType* synchronization_event{};
std::thread thread;
std::thread::id thread_id;
};
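
The command queue now carries a std::variant, and RunThread dispatches with std::get_if, which yields a pointer only when the variant currently holds the queried alternative. A self-contained sketch of that dispatch style, with made-up command types:

    #include <cstdio>
    #include <variant>

    struct SubmitList { int count; };
    struct SwapBuffers {};

    using Command = std::variant<SubmitList, SwapBuffers>;

    void Dispatch(Command& command) {
        if (const auto* submit = std::get_if<SubmitList>(&command)) {
            std::printf("submit %d entries\n", submit->count);
        } else if (std::get_if<SwapBuffers>(&command)) {
            std::printf("swap buffers\n");
        }
    }
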
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index 64f75db43..524d9ea5a 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -223,27 +223,21 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
}
u32 MacroInterpreter::FetchParameter() {
- ASSERT(next_parameter_index < parameters.size());
- return parameters[next_parameter_index++];
+ return parameters.at(next_parameter_index++);
}
u32 MacroInterpreter::GetRegister(u32 register_id) const {
- // Register 0 is supposed to always return 0.
- if (register_id == 0)
- return 0;
-
- ASSERT(register_id < registers.size());
- return registers[register_id];
+ return registers.at(register_id);
}
void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
- // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
- // register.
- if (register_id == 0)
+ // Register 0 is hardwired as the zero register.
+ // Ensure no writes to it actually occur.
+ if (register_id == 0) {
return;
+ }
- ASSERT(register_id < registers.size());
- registers[register_id] = value;
+ registers.at(register_id) = value;
}
void MacroInterpreter::SetMethodAddress(u32 address) {
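
The interpreter changes above swap manual ASSERT bounds checks for std::array::at, which throws std::out_of_range on a bad index, and they keep register 0 hardwired to zero by refusing writes rather than special-casing reads. A small sketch of the resulting invariant:

    #include <array>
    #include <cstdint>

    // registers[0] stays 0 forever because writes to it are ignored,
    // so reads need no special case.
    std::array<std::uint32_t, 8> registers{};

    std::uint32_t GetRegister(std::uint32_t id) {
        // at() throws std::out_of_range instead of tripping a debug ASSERT.
        return registers.at(id);
    }

    void SetRegister(std::uint32_t id, std::uint32_t value) {
        if (id == 0) {
            return; // register 0 is the zero register; NOP is a store to it
        }
        registers.at(id) = value;
    }
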
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 54abe5298..0f4e820aa 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -5,181 +5,528 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
+#include "core/memory.h"
#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
namespace Tegra {
-MemoryManager::MemoryManager() {
- // Mark the first page as reserved, so that 0 is not a valid GPUVAddr. Otherwise, games might
- // try to use 0 as a valid address, which is also used to mean nullptr. This fixes a bug with
- // Undertale using 0 for a render target.
- PageSlot(0) = static_cast<u64>(PageStatus::Reserved);
+MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
+ std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
+ std::fill(page_table.attributes.begin(), page_table.attributes.end(),
+ Common::PageType::Unmapped);
+ page_table.Resize(address_space_width);
+
+ // Initialize the map with a single free region covering the entire managed space.
+ VirtualMemoryArea initial_vma;
+ initial_vma.size = address_space_end;
+ vma_map.emplace(initial_vma.base, initial_vma);
+
+ UpdatePageTableForVMA(initial_vma);
}
GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) {
- const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, align, PageStatus::Unmapped)};
+ const u64 aligned_size{Common::AlignUp(size, page_size)};
+ const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
- ASSERT_MSG(gpu_addr, "unable to find available GPU memory");
+ AllocateMemory(gpu_addr, 0, aligned_size);
- for (u64 offset{}; offset < size; offset += PAGE_SIZE) {
- VAddr& slot{PageSlot(*gpu_addr + offset)};
+ return gpu_addr;
+}
- ASSERT(slot == static_cast<u64>(PageStatus::Unmapped));
+GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) {
+ const u64 aligned_size{Common::AlignUp(size, page_size)};
- slot = static_cast<u64>(PageStatus::Allocated);
- }
+ AllocateMemory(gpu_addr, 0, aligned_size);
- return *gpu_addr;
+ return gpu_addr;
}
-GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) {
- for (u64 offset{}; offset < size; offset += PAGE_SIZE) {
- VAddr& slot{PageSlot(gpu_addr + offset)};
+GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
+ const u64 aligned_size{Common::AlignUp(size, page_size)};
+ const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
- ASSERT(slot == static_cast<u64>(PageStatus::Unmapped));
+ MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
- slot = static_cast<u64>(PageStatus::Allocated);
- }
+ return gpu_addr;
+}
+
+GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) {
+ ASSERT((gpu_addr & page_mask) == 0);
+
+ const u64 aligned_size{Common::AlignUp(size, page_size)};
+
+ MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
return gpu_addr;
}
-GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
- const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, PAGE_SIZE, PageStatus::Unmapped)};
+GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
+ ASSERT((gpu_addr & page_mask) == 0);
+
+ const u64 aligned_size{Common::AlignUp(size, page_size)};
+ const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};
+
+ rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
+ UnmapRange(gpu_addr, aligned_size);
- ASSERT_MSG(gpu_addr, "unable to find available GPU memory");
+ return gpu_addr;
+}
- for (u64 offset{}; offset < size; offset += PAGE_SIZE) {
- VAddr& slot{PageSlot(*gpu_addr + offset)};
+GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) const {
+ // Find the first Free VMA.
+ const VMAHandle vma_handle{
+ std::find_if(vma_map.begin(), vma_map.end(), [region_start, size](const auto& vma) {
+ if (vma.second.type != VirtualMemoryArea::Type::Unmapped) {
+ return false;
+ }
- ASSERT(slot == static_cast<u64>(PageStatus::Unmapped));
+ const VAddr vma_end{vma.second.base + vma.second.size};
+ return vma_end > region_start && vma_end >= region_start + size;
+ })};
- slot = cpu_addr + offset;
+ if (vma_handle == vma_map.end()) {
+ return {};
}
- const MappedRegion region{cpu_addr, *gpu_addr, size};
- mapped_regions.push_back(region);
+ return std::max(region_start, vma_handle->second.base);
+}
- return *gpu_addr;
+bool MemoryManager::IsAddressValid(GPUVAddr addr) const {
+ return (addr >> page_bits) < page_table.pointers.size();
}
-GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) {
- ASSERT((gpu_addr & PAGE_MASK) == 0);
+std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) const {
+ if (!IsAddressValid(addr)) {
+ return {};
+ }
- if (PageSlot(gpu_addr) != static_cast<u64>(PageStatus::Allocated)) {
- // Page has been already mapped. In this case, we must find a new area of memory to use that
- // is different than the specified one. Super Mario Odyssey hits this scenario when changing
- // areas, but we do not want to overwrite the old pages.
- // TODO(bunnei): We need to write a hardware test to confirm this behavior.
+ const VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]};
+ if (cpu_addr) {
+ return cpu_addr + (addr & page_mask);
+ }
+
+ return {};
+}
- LOG_ERROR(HW_GPU, "attempting to map addr 0x{:016X}, which is not available!", gpu_addr);
+template <typename T>
+T MemoryManager::Read(GPUVAddr addr) const {
+ if (!IsAddressValid(addr)) {
+ return {};
+ }
- const std::optional<GPUVAddr> new_gpu_addr{
- FindFreeBlock(gpu_addr, size, PAGE_SIZE, PageStatus::Allocated)};
+ const u8* page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer) {
+ // NOTE: Avoid adding any extra logic to this fast-path block
+ T value;
+ std::memcpy(&value, &page_pointer[addr & page_mask], sizeof(T));
+ return value;
+ }
- ASSERT_MSG(new_gpu_addr, "unable to find available GPU memory");
+ switch (page_table.attributes[addr >> page_bits]) {
+ case Common::PageType::Unmapped:
+ LOG_ERROR(HW_GPU, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, addr);
+ return 0;
+ case Common::PageType::Memory:
+ ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ return {};
+}
- gpu_addr = *new_gpu_addr;
+template <typename T>
+void MemoryManager::Write(GPUVAddr addr, T data) {
+ if (!IsAddressValid(addr)) {
+ return;
}
- for (u64 offset{}; offset < size; offset += PAGE_SIZE) {
- VAddr& slot{PageSlot(gpu_addr + offset)};
+ u8* page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer) {
+ // NOTE: Avoid adding any extra logic to this fast-path block
+ std::memcpy(&page_pointer[addr & page_mask], &data, sizeof(T));
+ return;
+ }
- ASSERT(slot == static_cast<u64>(PageStatus::Allocated));
+ switch (page_table.attributes[addr >> page_bits]) {
+ case Common::PageType::Unmapped:
+ LOG_ERROR(HW_GPU, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
+ static_cast<u32>(data), addr);
+ return;
+ case Common::PageType::Memory:
+ ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr);
+ break;
+ default:
+ UNREACHABLE();
+ }
+}
- slot = cpu_addr + offset;
+template u8 MemoryManager::Read<u8>(GPUVAddr addr) const;
+template u16 MemoryManager::Read<u16>(GPUVAddr addr) const;
+template u32 MemoryManager::Read<u32>(GPUVAddr addr) const;
+template u64 MemoryManager::Read<u64>(GPUVAddr addr) const;
+template void MemoryManager::Write<u8>(GPUVAddr addr, u8 data);
+template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data);
+template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data);
+template void MemoryManager::Write<u64>(GPUVAddr addr, u64 data);
+
+u8* MemoryManager::GetPointer(GPUVAddr addr) {
+ if (!IsAddressValid(addr)) {
+ return {};
}
- const MappedRegion region{cpu_addr, gpu_addr, size};
- mapped_regions.push_back(region);
+ u8* const page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer != nullptr) {
+ return page_pointer + (addr & page_mask);
+ }
- return gpu_addr;
+ LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr);
+ return {};
}
-GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
- ASSERT((gpu_addr & PAGE_MASK) == 0);
+const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
+ if (!IsAddressValid(addr)) {
+ return {};
+ }
- for (u64 offset{}; offset < size; offset += PAGE_SIZE) {
- VAddr& slot{PageSlot(gpu_addr + offset)};
+ const u8* const page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer != nullptr) {
+ return page_pointer + (addr & page_mask);
+ }
- ASSERT(slot != static_cast<u64>(PageStatus::Allocated) &&
- slot != static_cast<u64>(PageStatus::Unmapped));
+ LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr);
+ return {};
+}
- slot = static_cast<u64>(PageStatus::Unmapped);
- }
+void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const {
+ std::size_t remaining_size{size};
+ std::size_t page_index{src_addr >> page_bits};
+ std::size_t page_offset{src_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ std::memcpy(dest_buffer, src_ptr, copy_amount);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
- // Delete the region mappings that are contained within the unmapped region
- mapped_regions.erase(std::remove_if(mapped_regions.begin(), mapped_regions.end(),
- [&](const MappedRegion& region) {
- return region.gpu_addr <= gpu_addr &&
- region.gpu_addr + region.size < gpu_addr + size;
- }),
- mapped_regions.end());
- return gpu_addr;
+ page_index++;
+ page_offset = 0;
+ dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+ remaining_size -= copy_amount;
+ }
}
-GPUVAddr MemoryManager::GetRegionEnd(GPUVAddr region_start) const {
- for (const auto& region : mapped_regions) {
- const GPUVAddr region_end{region.gpu_addr + region.size};
- if (region_start >= region.gpu_addr && region_start < region_end) {
- return region_end;
+void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) {
+ std::size_t remaining_size{size};
+ std::size_t page_index{dest_addr >> page_bits};
+ std::size_t page_offset{dest_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ u8* dest_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
+ std::memcpy(dest_ptr, src_buffer, copy_amount);
+ break;
}
+ default:
+ UNREACHABLE();
+ }
+
+ page_index++;
+ page_offset = 0;
+ src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+ remaining_size -= copy_amount;
}
- return {};
}
-std::optional<GPUVAddr> MemoryManager::FindFreeBlock(GPUVAddr region_start, u64 size, u64 align,
- PageStatus status) {
- GPUVAddr gpu_addr{region_start};
- u64 free_space{};
- align = (align + PAGE_MASK) & ~PAGE_MASK;
+void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) {
+ std::size_t remaining_size{size};
+ std::size_t page_index{src_addr >> page_bits};
+ std::size_t page_offset{src_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ WriteBlock(dest_addr, src_ptr, copy_amount);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
- while (gpu_addr + free_space < MAX_ADDRESS) {
- if (PageSlot(gpu_addr + free_space) == static_cast<u64>(status)) {
- free_space += PAGE_SIZE;
- if (free_space >= size) {
- return gpu_addr;
- }
- } else {
- gpu_addr += free_space + PAGE_SIZE;
- free_space = 0;
- gpu_addr = Common::AlignUp(gpu_addr, align);
+ page_index++;
+ page_offset = 0;
+ dest_addr += static_cast<VAddr>(copy_amount);
+ src_addr += static_cast<VAddr>(copy_amount);
+ remaining_size -= copy_amount;
+ }
+}
+
+void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
+ VAddr backing_addr) {
+ LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size,
+ (base + size) * page_size);
+
+ const VAddr end{base + size};
+ ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}",
+ base + page_table.pointers.size());
+
+ std::fill(page_table.attributes.begin() + base, page_table.attributes.begin() + end, type);
+
+ if (memory == nullptr) {
+ std::fill(page_table.pointers.begin() + base, page_table.pointers.begin() + end, memory);
+ std::fill(page_table.backing_addr.begin() + base, page_table.backing_addr.begin() + end,
+ backing_addr);
+ } else {
+ while (base != end) {
+ page_table.pointers[base] = memory;
+ page_table.backing_addr[base] = backing_addr;
+
+ base += 1;
+ memory += page_size;
+ backing_addr += page_size;
}
}
+}
- return {};
+void MemoryManager::MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr) {
+ ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size);
+ ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base);
+ MapPages(base / page_size, size / page_size, target, Common::PageType::Memory, backing_addr);
}
-std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) {
- const VAddr base_addr{PageSlot(gpu_addr)};
+void MemoryManager::UnmapRegion(GPUVAddr base, u64 size) {
+ ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size);
+ ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base);
+ MapPages(base / page_size, size / page_size, nullptr, Common::PageType::Unmapped);
+}
- if (base_addr == static_cast<u64>(PageStatus::Allocated) ||
- base_addr == static_cast<u64>(PageStatus::Unmapped) ||
- base_addr == static_cast<u64>(PageStatus::Reserved)) {
+bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const {
+ ASSERT(base + size == next.base);
+ if (type != next.type) {
+ return {};
+ }
+ if (type == VirtualMemoryArea::Type::Allocated && (offset + size != next.offset)) {
+ return {};
+ }
+ if (type == VirtualMemoryArea::Type::Mapped && backing_memory + size != next.backing_memory) {
return {};
}
+ return true;
+}
+
+MemoryManager::VMAHandle MemoryManager::FindVMA(GPUVAddr target) const {
+ if (target >= address_space_end) {
+ return vma_map.end();
+ } else {
+ return std::prev(vma_map.upper_bound(target));
+ }
+}
+
+MemoryManager::VMAIter MemoryManager::Allocate(VMAIter vma_handle) {
+ VirtualMemoryArea& vma{vma_handle->second};
+
+ vma.type = VirtualMemoryArea::Type::Allocated;
+ vma.backing_addr = 0;
+ vma.backing_memory = {};
+ UpdatePageTableForVMA(vma);
+
+ return MergeAdjacent(vma_handle);
+}
+
+MemoryManager::VMAHandle MemoryManager::AllocateMemory(GPUVAddr target, std::size_t offset,
+ u64 size) {
+
+ // This is the appropriately sized VMA that will turn into our allocation.
+ VMAIter vma_handle{CarveVMA(target, size)};
+ VirtualMemoryArea& vma{vma_handle->second};
+
+ ASSERT(vma.size == size);
+
+ vma.offset = offset;
+
+ return Allocate(vma_handle);
+}
+
+MemoryManager::VMAHandle MemoryManager::MapBackingMemory(GPUVAddr target, u8* memory, u64 size,
+ VAddr backing_addr) {
+    // This is the appropriately sized VMA that will turn into our mapping.
+ VMAIter vma_handle{CarveVMA(target, size)};
+ VirtualMemoryArea& vma{vma_handle->second};
+
+ ASSERT(vma.size == size);
+
+ vma.type = VirtualMemoryArea::Type::Mapped;
+ vma.backing_memory = memory;
+ vma.backing_addr = backing_addr;
+ UpdatePageTableForVMA(vma);
+
+ return MergeAdjacent(vma_handle);
+}
+
+void MemoryManager::UnmapRange(GPUVAddr target, u64 size) {
+ VMAIter vma{CarveVMARange(target, size)};
+ const VAddr target_end{target + size};
+ const VMAIter end{vma_map.end()};
+
+ // The comparison against the end of the range must be done using addresses since VMAs can be
+ // merged during this process, causing invalidation of the iterators.
+ while (vma != end && vma->second.base < target_end) {
+ // Unmapped ranges return to allocated state and can be reused
+ // This behavior is used by Super Mario Odyssey, Sonic Forces, and likely other games
+ vma = std::next(Allocate(vma));
+ }
+
+ ASSERT(FindVMA(target)->second.size >= size);
+}
- return base_addr + (gpu_addr & PAGE_MASK);
+MemoryManager::VMAIter MemoryManager::StripIterConstness(const VMAHandle& iter) {
+ // This uses a neat C++ trick to convert a const_iterator to a regular iterator, given
+ // non-const access to its container.
+ return vma_map.erase(iter, iter); // Erases an empty range of elements
}
-std::vector<GPUVAddr> MemoryManager::CpuToGpuAddress(VAddr cpu_addr) const {
- std::vector<GPUVAddr> results;
- for (const auto& region : mapped_regions) {
- if (cpu_addr >= region.cpu_addr && cpu_addr < (region.cpu_addr + region.size)) {
- const u64 offset{cpu_addr - region.cpu_addr};
- results.push_back(region.gpu_addr + offset);
+MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) {
+ ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size);
+ ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: 0x{:016X}", base);
+
+ VMAIter vma_handle{StripIterConstness(FindVMA(base))};
+ if (vma_handle == vma_map.end()) {
+ // Target address is outside the managed range
+ return {};
+ }
+
+ const VirtualMemoryArea& vma{vma_handle->second};
+ if (vma.type == VirtualMemoryArea::Type::Mapped) {
+ // Region is already allocated
+ return vma_handle;
+ }
+
+ const VAddr start_in_vma{base - vma.base};
+ const VAddr end_in_vma{start_in_vma + size};
+
+ ASSERT_MSG(end_in_vma <= vma.size, "region size 0x{:016X} is less than required size 0x{:016X}",
+ vma.size, end_in_vma);
+
+ if (end_in_vma < vma.size) {
+ // Split VMA at the end of the allocated region
+ SplitVMA(vma_handle, end_in_vma);
+ }
+ if (start_in_vma != 0) {
+ // Split VMA at the start of the allocated region
+ vma_handle = SplitVMA(vma_handle, start_in_vma);
+ }
+
+ return vma_handle;
+}
+
+MemoryManager::VMAIter MemoryManager::CarveVMARange(GPUVAddr target, u64 size) {
+ ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size);
+ ASSERT_MSG((target & page_mask) == 0, "non-page aligned base: 0x{:016X}", target);
+
+ const VAddr target_end{target + size};
+ ASSERT(target_end >= target);
+ ASSERT(size > 0);
+
+ VMAIter begin_vma{StripIterConstness(FindVMA(target))};
+ const VMAIter i_end{vma_map.lower_bound(target_end)};
+ if (std::any_of(begin_vma, i_end, [](const auto& entry) {
+ return entry.second.type == VirtualMemoryArea::Type::Unmapped;
+ })) {
+ return {};
+ }
+
+ if (target != begin_vma->second.base) {
+ begin_vma = SplitVMA(begin_vma, target - begin_vma->second.base);
+ }
+
+ VMAIter end_vma{StripIterConstness(FindVMA(target_end))};
+ if (end_vma != vma_map.end() && target_end != end_vma->second.base) {
+ end_vma = SplitVMA(end_vma, target_end - end_vma->second.base);
+ }
+
+ return begin_vma;
+}
+
+MemoryManager::VMAIter MemoryManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) {
+ VirtualMemoryArea& old_vma{vma_handle->second};
+ VirtualMemoryArea new_vma{old_vma}; // Make a copy of the VMA
+
+ // For now, don't allow no-op VMA splits (trying to split at a boundary) because it's probably
+ // a bug. This restriction might be removed later.
+ ASSERT(offset_in_vma < old_vma.size);
+ ASSERT(offset_in_vma > 0);
+
+ old_vma.size = offset_in_vma;
+ new_vma.base += offset_in_vma;
+ new_vma.size -= offset_in_vma;
+
+ switch (new_vma.type) {
+ case VirtualMemoryArea::Type::Unmapped:
+ break;
+ case VirtualMemoryArea::Type::Allocated:
+ new_vma.offset += offset_in_vma;
+ break;
+ case VirtualMemoryArea::Type::Mapped:
+ new_vma.backing_memory += offset_in_vma;
+ break;
+ }
+
+ ASSERT(old_vma.CanBeMergedWith(new_vma));
+
+ return vma_map.emplace_hint(std::next(vma_handle), new_vma.base, new_vma);
+}
+
+MemoryManager::VMAIter MemoryManager::MergeAdjacent(VMAIter iter) {
+ const VMAIter next_vma{std::next(iter)};
+ if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) {
+ iter->second.size += next_vma->second.size;
+ vma_map.erase(next_vma);
+ }
+
+ if (iter != vma_map.begin()) {
+ VMAIter prev_vma{std::prev(iter)};
+ if (prev_vma->second.CanBeMergedWith(iter->second)) {
+ prev_vma->second.size += iter->second.size;
+ vma_map.erase(iter);
+ iter = prev_vma;
}
}
- return results;
+
+ return iter;
}
-VAddr& MemoryManager::PageSlot(GPUVAddr gpu_addr) {
- auto& block{page_table[(gpu_addr >> (PAGE_BITS + PAGE_TABLE_BITS)) & PAGE_TABLE_MASK]};
- if (!block) {
- block = std::make_unique<PageBlock>();
- block->fill(static_cast<VAddr>(PageStatus::Unmapped));
+void MemoryManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) {
+ switch (vma.type) {
+ case VirtualMemoryArea::Type::Unmapped:
+ UnmapRegion(vma.base, vma.size);
+ break;
+ case VirtualMemoryArea::Type::Allocated:
+ MapMemoryRegion(vma.base, vma.size, nullptr, vma.backing_addr);
+ break;
+ case VirtualMemoryArea::Type::Mapped:
+ MapMemoryRegion(vma.base, vma.size, vma.backing_memory, vma.backing_addr);
+ break;
}
- return (*block)[(gpu_addr >> PAGE_BITS) & PAGE_BLOCK_MASK];
}
} // namespace Tegra
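The VMA bookkeeping above follows a carve/split/merge pattern: CarveVMA splits a
free VMA at both edges of the requested range, and MergeAdjacent re-coalesces
neighbours once their state matches again. A minimal self-contained sketch of that
pattern over a plain std::map (toy types for illustration, not code from this
change):

#include <cstdint>
#include <iterator>
#include <map>

// Toy VMA holding only what the split/merge idea needs.
struct ToyVMA {
    std::uint64_t base{};
    std::uint64_t size{};
    bool mapped{};
};
using ToyMap = std::map<std::uint64_t, ToyVMA>;

// Split the VMA at `it` so that a new VMA begins `offset` bytes into it; the
// left piece keeps its key, the right piece is inserted just after it
// (cf. SplitVMA above).
ToyMap::iterator Split(ToyMap& map, ToyMap::iterator it, std::uint64_t offset) {
    ToyVMA right{it->second};
    it->second.size = offset;
    right.base += offset;
    right.size -= offset;
    return map.emplace_hint(std::next(it), right.base, right);
}

// Re-coalesce `it` with its right neighbour when both are contiguous and in the
// same state (cf. MergeAdjacent above, which also checks the left neighbour).
void MergeRight(ToyMap& map, ToyMap::iterator it) {
    const auto next = std::next(it);
    if (next != map.end() && it->second.mapped == next->second.mapped &&
        it->second.base + it->second.size == next->second.base) {
        it->second.size += next->second.size;
        map.erase(next);
    }
}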
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index fb03497ca..647cbf93a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -1,67 +1,154 @@
-// Copyright 2018 yuzu emulator team
+// Copyright 2018 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
-#include <array>
-#include <memory>
+#include <map>
#include <optional>
-#include <vector>
#include "common/common_types.h"
+#include "common/page_table.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
namespace Tegra {
-/// Virtual addresses in the GPU's memory map are 64 bit.
-using GPUVAddr = u64;
+/**
+ * Represents a VMA in an address space. A VMA is a contiguous region of virtual addressing space
+ * with homogeneous attributes across its extents. In this particular implementation each VMA is
+ * also backed by a single host memory allocation.
+ */
+struct VirtualMemoryArea {
+ enum class Type : u8 {
+ Unmapped,
+ Allocated,
+ Mapped,
+ };
+
+ /// Virtual base address of the region.
+ GPUVAddr base{};
+ /// Size of the region.
+ u64 size{};
+ /// Memory area mapping type.
+ Type type{Type::Unmapped};
+ /// CPU memory mapped address corresponding to this memory area.
+ VAddr backing_addr{};
+ /// Offset into the backing_memory the mapping starts from.
+    /// Offset into backing_memory at which the mapping starts.
+ /// Pointer backing this VMA.
+ u8* backing_memory{};
+
+ /// Tests if this area can be merged to the right with `next`.
+ bool CanBeMergedWith(const VirtualMemoryArea& next) const;
+};
class MemoryManager final {
public:
- MemoryManager();
+ MemoryManager(VideoCore::RasterizerInterface& rasterizer);
GPUVAddr AllocateSpace(u64 size, u64 align);
- GPUVAddr AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align);
+ GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align);
GPUVAddr MapBufferEx(VAddr cpu_addr, u64 size);
- GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size);
- GPUVAddr UnmapBuffer(GPUVAddr gpu_addr, u64 size);
- GPUVAddr GetRegionEnd(GPUVAddr region_start) const;
- std::optional<VAddr> GpuToCpuAddress(GPUVAddr gpu_addr);
- std::vector<GPUVAddr> CpuToGpuAddress(VAddr cpu_addr) const;
+ GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr addr, u64 size);
+ GPUVAddr UnmapBuffer(GPUVAddr addr, u64 size);
+ std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
+
+ template <typename T>
+ T Read(GPUVAddr addr) const;
+
+ template <typename T>
+ void Write(GPUVAddr addr, T data);
- static constexpr u64 PAGE_BITS = 16;
- static constexpr u64 PAGE_SIZE = 1 << PAGE_BITS;
- static constexpr u64 PAGE_MASK = PAGE_SIZE - 1;
+ u8* GetPointer(GPUVAddr addr);
+ const u8* GetPointer(GPUVAddr addr) const;
+
+ void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
+ void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
+ void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
private:
- enum class PageStatus : u64 {
- Unmapped = 0xFFFFFFFFFFFFFFFFULL,
- Allocated = 0xFFFFFFFFFFFFFFFEULL,
- Reserved = 0xFFFFFFFFFFFFFFFDULL,
- };
+ using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>;
+ using VMAHandle = VMAMap::const_iterator;
+ using VMAIter = VMAMap::iterator;
- std::optional<GPUVAddr> FindFreeBlock(GPUVAddr region_start, u64 size, u64 align,
- PageStatus status);
- VAddr& PageSlot(GPUVAddr gpu_addr);
-
- static constexpr u64 MAX_ADDRESS{0x10000000000ULL};
- static constexpr u64 PAGE_TABLE_BITS{10};
- static constexpr u64 PAGE_TABLE_SIZE{1 << PAGE_TABLE_BITS};
- static constexpr u64 PAGE_TABLE_MASK{PAGE_TABLE_SIZE - 1};
- static constexpr u64 PAGE_BLOCK_BITS{14};
- static constexpr u64 PAGE_BLOCK_SIZE{1 << PAGE_BLOCK_BITS};
- static constexpr u64 PAGE_BLOCK_MASK{PAGE_BLOCK_SIZE - 1};
-
- using PageBlock = std::array<VAddr, PAGE_BLOCK_SIZE>;
- std::array<std::unique_ptr<PageBlock>, PAGE_TABLE_SIZE> page_table{};
-
- struct MappedRegion {
- VAddr cpu_addr;
- GPUVAddr gpu_addr;
- u64 size;
- };
+ bool IsAddressValid(GPUVAddr addr) const;
+ void MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
+ VAddr backing_addr = 0);
+ void MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr);
+ void UnmapRegion(GPUVAddr base, u64 size);
+
+    /// Finds the VMA in which the given address is included, or `vma_map.end()` if there is none.
+ VMAHandle FindVMA(GPUVAddr target) const;
+
+ VMAHandle AllocateMemory(GPUVAddr target, std::size_t offset, u64 size);
+
+ /**
+ * Maps an unmanaged host memory pointer at a given address.
+ *
+ * @param target The guest address to start the mapping at.
+ * @param memory The memory to be mapped.
+ * @param size Size of the mapping.
+     * @param backing_addr The base CPU address that the mapping is backed by.
+ */
+ VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr);
+
+ /// Unmaps a range of addresses, splitting VMAs as necessary.
+ void UnmapRange(GPUVAddr target, u64 size);
+
+ /// Converts a VMAHandle to a mutable VMAIter.
+ VMAIter StripIterConstness(const VMAHandle& iter);
+
+    /// Marks the specified VMA as allocated.
+ VMAIter Allocate(VMAIter vma);
+
+ /**
+ * Carves a VMA of a specific size at the specified address by splitting Free VMAs while doing
+ * the appropriate error checking.
+ */
+ VMAIter CarveVMA(GPUVAddr base, u64 size);
+
+ /**
+ * Splits the edges of the given range of non-Free VMAs so that there is a VMA split at each
+ * end of the range.
+ */
+ VMAIter CarveVMARange(GPUVAddr base, u64 size);
+
+ /**
+ * Splits a VMA in two, at the specified offset.
+ * @returns the right side of the split, with the original iterator becoming the left side.
+ */
+ VMAIter SplitVMA(VMAIter vma, u64 offset_in_vma);
+
+ /**
+ * Checks for and merges the specified VMA with adjacent ones if possible.
+ * @returns the merged VMA or the original if no merging was possible.
+ */
+ VMAIter MergeAdjacent(VMAIter vma);
+
+ /// Updates the pages corresponding to this VMA so they match the VMA's attributes.
+ void UpdatePageTableForVMA(const VirtualMemoryArea& vma);
+
+    /// Finds a free (unmapped) region of the specified size starting at the specified address.
+ GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size) const;
+
+private:
+ static constexpr u64 page_bits{16};
+ static constexpr u64 page_size{1 << page_bits};
+ static constexpr u64 page_mask{page_size - 1};
+
+    /// Address space width in bits; this is fairly arbitrary but sufficiently large.
+    static constexpr u32 address_space_width{39};
+    /// Start address for mapping; this is fairly arbitrary but must be non-zero.
+    static constexpr GPUVAddr address_space_base{0x100000};
+ /// End of address space, based on address space in bits.
+ static constexpr GPUVAddr address_space_end{1ULL << address_space_width};
- std::vector<MappedRegion> mapped_regions;
+ Common::PageTable page_table{page_bits};
+ VMAMap vma_map;
+ VideoCore::RasterizerInterface& rasterizer;
};
} // namespace Tegra
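The resulting public surface is small. A hypothetical caller (the Example function,
addresses, and sizes are invented for illustration) would exercise it like this:

void Example(Tegra::MemoryManager& mm, VAddr cpu_addr) {
    // Map 64 KiB of guest CPU memory at any free spot in the GPU address space.
    const GPUVAddr gpu_addr{mm.MapBufferEx(cpu_addr, 0x10000)};

    // Typed accesses go through the page table's pointer fast path.
    mm.Write<u32>(gpu_addr, 0xCAFEBABE);
    ASSERT(mm.Read<u32>(gpu_addr) == 0xCAFEBABE);

    // Reverse translation can fail for unmapped pages, hence the std::optional.
    if (const auto cpu{mm.GpuToCpuAddress(gpu_addr)}) {
        ASSERT(*cpu == cpu_addr);
    }

    mm.UnmapBuffer(gpu_addr, 0x10000);
}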
diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index 9692ce143..3e91cbc83 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -6,7 +6,6 @@
#include <cstring>
#include "common/assert.h"
#include "common/common_types.h"
-#include "core/memory.h"
#include "video_core/morton.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
@@ -16,12 +15,12 @@ namespace VideoCore {
using Surface::GetBytesPerPixel;
using Surface::PixelFormat;
-using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, VAddr);
+using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, u8*);
using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>;
template <bool morton_to_linear, PixelFormat format>
static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth,
- u32 tile_width_spacing, u8* buffer, VAddr addr) {
+ u32 tile_width_spacing, u8* buffer, u8* addr) {
constexpr u32 bytes_per_pixel = GetBytesPerPixel(format);
// With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
@@ -34,10 +33,10 @@ static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth
stride, height, depth, block_height, block_depth,
tile_width_spacing);
} else {
- Tegra::Texture::CopySwizzledData(
- (stride + tile_size_x - 1) / tile_size_x, (height + tile_size_y - 1) / tile_size_y,
- depth, bytes_per_pixel, bytes_per_pixel, Memory::GetPointer(addr), buffer, false,
- block_height, block_depth, tile_width_spacing);
+ Tegra::Texture::CopySwizzledData((stride + tile_size_x - 1) / tile_size_x,
+ (height + tile_size_y - 1) / tile_size_y, depth,
+ bytes_per_pixel, bytes_per_pixel, addr, buffer, false,
+ block_height, block_depth, tile_width_spacing);
}
}
@@ -282,7 +281,7 @@ static u32 GetMortonOffset128(u32 x, u32 y, u32 bytes_per_pixel) {
void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
- u8* buffer, VAddr addr) {
+ u8* buffer, u8* addr) {
GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth,
tile_width_spacing, buffer, addr);
}
diff --git a/src/video_core/morton.h b/src/video_core/morton.h
index b565204b5..ee5b45555 100644
--- a/src/video_core/morton.h
+++ b/src/video_core/morton.h
@@ -13,7 +13,7 @@ enum class MortonSwizzleMode { MortonToLinear, LinearToMorton };
void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
- u8* buffer, VAddr addr);
+ u8* buffer, u8* addr);
void MortonCopyPixels128(MortonSwizzleMode mode, u32 width, u32 height, u32 bytes_per_pixel,
u32 linear_bytes_per_pixel, u8* morton_data, u8* linear_data);
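With MortonSwizzle now taking a host pointer rather than a guest VAddr, callers
resolve the pointer through the GPU MemoryManager first. A sketch, where `params`
stands in for a SurfaceParams-like struct carrying the fields used here:

u8* const host_ptr{memory_manager.GetPointer(params.gpu_addr)};
std::vector<u8> linear(params.size_in_bytes);
VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, params.pixel_format,
                         params.width, params.block_height, params.height, params.block_depth,
                         /*depth=*/1, params.tile_width_spacing, linear.data(), host_ptr);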
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index a7bcf26fb..291772186 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -4,6 +4,7 @@
#pragma once
+#include <mutex>
#include <set>
#include <unordered_map>
@@ -12,14 +13,26 @@
#include "common/common_types.h"
#include "core/settings.h"
+#include "video_core/gpu.h"
#include "video_core/rasterizer_interface.h"
class RasterizerCacheObject {
public:
+ explicit RasterizerCacheObject(const u8* host_ptr)
+ : host_ptr{host_ptr}, cache_addr{ToCacheAddr(host_ptr)} {}
+
virtual ~RasterizerCacheObject();
+ CacheAddr GetCacheAddr() const {
+ return cache_addr;
+ }
+
+ const u8* GetHostPtr() const {
+ return host_ptr;
+ }
+
/// Gets the address of the object in guest memory, required for cache management
-    virtual VAddr GetAddr() const = 0;
+    virtual VAddr GetCpuAddr() const = 0;
/// Gets the size of the object in guest memory, required for cache management
virtual std::size_t GetSizeInBytes() const = 0;
@@ -58,6 +71,8 @@ private:
bool is_registered{}; ///< Whether the object is currently registered with the cache
bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
+ const u8* host_ptr{}; ///< Pointer to the memory backing this cached region
+    CacheAddr cache_addr{};    ///< Cache address for this memory, distinct from the emulated virtual address space
};
template <class T>
@@ -68,7 +83,9 @@ public:
explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
/// Write any cached resources overlapping the specified region back to memory
- void FlushRegion(Tegra::GPUVAddr addr, size_t size) {
+ void FlushRegion(CacheAddr addr, std::size_t size) {
+ std::lock_guard lock{mutex};
+
const auto& objects{GetSortedObjectsFromRegion(addr, size)};
for (auto& object : objects) {
FlushObject(object);
@@ -76,7 +93,9 @@ public:
}
/// Mark the specified region as being invalidated
- void InvalidateRegion(VAddr addr, u64 size) {
+ void InvalidateRegion(CacheAddr addr, u64 size) {
+ std::lock_guard lock{mutex};
+
const auto& objects{GetSortedObjectsFromRegion(addr, size)};
for (auto& object : objects) {
if (!object->IsRegistered()) {
@@ -89,48 +108,60 @@ public:
/// Invalidates everything in the cache
void InvalidateAll() {
+ std::lock_guard lock{mutex};
+
while (interval_cache.begin() != interval_cache.end()) {
Unregister(*interval_cache.begin()->second.begin());
}
}
protected:
- /// Tries to get an object from the cache with the specified address
- T TryGet(VAddr addr) const {
+ /// Tries to get an object from the cache with the specified cache address
+ T TryGet(CacheAddr addr) const {
const auto iter = map_cache.find(addr);
if (iter != map_cache.end())
return iter->second;
return nullptr;
}
+ T TryGet(const void* addr) const {
+ const auto iter = map_cache.find(ToCacheAddr(addr));
+ if (iter != map_cache.end())
+ return iter->second;
+ return nullptr;
+ }
+
/// Register an object into the cache
- void Register(const T& object) {
+ virtual void Register(const T& object) {
+ std::lock_guard lock{mutex};
+
object->SetIsRegistered(true);
interval_cache.add({GetInterval(object), ObjectSet{object}});
- map_cache.insert({object->GetAddr(), object});
- rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), 1);
+ map_cache.insert({object->GetCacheAddr(), object});
+ rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
}
/// Unregisters an object from the cache
- void Unregister(const T& object) {
- object->SetIsRegistered(false);
- rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1);
- // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
- if (Settings::values.use_accurate_gpu_emulation) {
- FlushObject(object);
- }
+ virtual void Unregister(const T& object) {
+ std::lock_guard lock{mutex};
+ object->SetIsRegistered(false);
+ rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
interval_cache.subtract({GetInterval(object), ObjectSet{object}});
- map_cache.erase(object->GetAddr());
+ map_cache.erase(object->GetCacheAddr());
}
/// Returns a ticks counter used for tracking when cached objects were last modified
u64 GetModifiedTicks() {
+ std::lock_guard lock{mutex};
+
return ++modified_ticks;
}
/// Flushes the specified object, updating appropriate cache state as needed
void FlushObject(const T& object) {
+ std::lock_guard lock{mutex};
+
if (!object->IsDirty()) {
return;
}
@@ -140,7 +171,7 @@ protected:
private:
/// Returns a list of cached objects from the specified memory region, ordered by access time
- std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
+ std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) {
if (size == 0) {
return {};
}
@@ -164,17 +195,18 @@ private:
}
using ObjectSet = std::set<T>;
- using ObjectCache = std::unordered_map<VAddr, T>;
- using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
+ using ObjectCache = std::unordered_map<CacheAddr, T>;
+ using IntervalCache = boost::icl::interval_map<CacheAddr, ObjectSet>;
using ObjectInterval = typename IntervalCache::interval_type;
static auto GetInterval(const T& object) {
- return ObjectInterval::right_open(object->GetAddr(),
- object->GetAddr() + object->GetSizeInBytes());
+ return ObjectInterval::right_open(object->GetCacheAddr(),
+ object->GetCacheAddr() + object->GetSizeInBytes());
}
ObjectCache map_cache;
IntervalCache interval_cache; ///< Cache of objects
u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
VideoCore::RasterizerInterface& rasterizer;
+ std::recursive_mutex mutex;
};
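A cached object now carries two addresses: the CacheAddr derived from its host
pointer keys the interval and map caches, while the CPU address is used only for
page-count tracking. A minimal hypothetical subclass showing the pattern the
OpenGL caches below follow:

class CachedThing final : public RasterizerCacheObject {
public:
    explicit CachedThing(VAddr cpu_addr, std::size_t size, u8* host_ptr)
        : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {}

    VAddr GetCpuAddr() const override {
        return cpu_addr;
    }

    std::size_t GetSizeInBytes() const override {
        return size;
    }

    // Nothing to write back in this example.
    void Flush() override {}

private:
    VAddr cpu_addr{};
    std::size_t size{};
};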
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6a1dc9cf6..d7b86df38 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -9,7 +9,6 @@
#include "common/common_types.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
namespace VideoCore {
@@ -35,14 +34,14 @@ public:
virtual void FlushAll() = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
- virtual void FlushRegion(VAddr addr, u64 size) = 0;
+ virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be invalidated
- virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+ virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated
- virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+ virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
@@ -63,7 +62,7 @@ public:
}
/// Increase/decrease the number of objects in pages touching the specified region
- virtual void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {}
+ virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}
/// Initialize disk cached resources for the game being emulated
virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false,
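These signatures assume CacheAddr is a plain integer view of a host pointer. A
sketch of the conversion they rely on (the actual definition lives in gpu.h, which
this change set also touches):

using CacheAddr = std::uintptr_t;

inline CacheAddr ToCacheAddr(const void* host_ptr) {
    return reinterpret_cast<CacheAddr>(host_ptr);
}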
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index b3062e5ba..7989ec11b 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,30 +7,33 @@
#include "common/alignment.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
namespace OpenGL {
+CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
+ std::size_t alignment, u8* host_ptr)
+ : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
+ alignment{alignment} {}
+
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
: RasterizerCache{rasterizer}, stream_buffer(size, true) {}
-GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size,
- std::size_t alignment, bool cache) {
+GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment,
+ bool cache) {
auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
- const auto cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
- ASSERT_MSG(cpu_addr, "Invalid GPU address");
// Cache management is a big overhead, so only cache entries above a minimum size.
// TODO: Figure out which size threshold is best for given games.
cache &= size >= 2048;
+ const auto& host_ptr{memory_manager.GetPointer(gpu_addr)};
if (cache) {
- auto entry = TryGet(*cpu_addr);
+ auto entry = TryGet(host_ptr);
if (entry) {
- if (entry->size >= size && entry->alignment == alignment) {
- return entry->offset;
+ if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
+ return entry->GetOffset();
}
Unregister(entry);
}
@@ -39,17 +42,17 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
AlignBuffer(alignment);
const GLintptr uploaded_offset = buffer_offset;
- Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+ if (!host_ptr) {
+ return uploaded_offset;
+ }
+ std::memcpy(buffer_ptr, host_ptr, size);
buffer_ptr += size;
buffer_offset += size;
if (cache) {
- auto entry = std::make_shared<CachedBufferEntry>();
- entry->offset = uploaded_offset;
- entry->size = size;
- entry->alignment = alignment;
- entry->addr = *cpu_addr;
+ auto entry = std::make_shared<CachedBufferEntry>(
+ *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr);
Register(entry);
}
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index c11acfb79..fc33aa433 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -17,22 +17,39 @@ namespace OpenGL {
class RasterizerOpenGL;
-struct CachedBufferEntry final : public RasterizerCacheObject {
- VAddr GetAddr() const override {
- return addr;
+class CachedBufferEntry final : public RasterizerCacheObject {
+public:
+ explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
+ std::size_t alignment, u8* host_ptr);
+
+ VAddr GetCpuAddr() const override {
+ return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
return size;
}
+ std::size_t GetSize() const {
+ return size;
+ }
+
+ GLintptr GetOffset() const {
+ return offset;
+ }
+
+ std::size_t GetAlignment() const {
+ return alignment;
+ }
+
// We do not have to flush this cache as things in it are never modified by us.
void Flush() override {}
- VAddr addr;
- std::size_t size;
- GLintptr offset;
- std::size_t alignment;
+private:
+ VAddr cpu_addr{};
+ std::size_t size{};
+ GLintptr offset{};
+ std::size_t alignment{};
};
class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
@@ -41,7 +58,7 @@ public:
/// Uploads data from a guest GPU address. Returns the offset within the host buffer where
/// the data has been allocated.
- GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+ GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
bool cache = true);
/// Uploads data from host memory. Returns the offset within the host buffer where it has been allocated.
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
index 7161d1dea..5842d6213 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_global_cache.cpp
@@ -4,10 +4,8 @@
#include <glad/glad.h>
-#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -15,12 +13,13 @@
namespace OpenGL {
-CachedGlobalRegion::CachedGlobalRegion(VAddr addr, u32 size) : addr{addr}, size{size} {
+CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
+ : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {
buffer.Create();
// Bind and unbind the buffer so it gets allocated by the driver
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
- LabelGLObject(GL_BUFFER, buffer.handle, addr, "GlobalMemory");
+ LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
}
void CachedGlobalRegion::Reload(u32 size_) {
@@ -35,10 +34,10 @@ void CachedGlobalRegion::Reload(u32 size_) {
// TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
- glBufferData(GL_SHADER_STORAGE_BUFFER, size, Memory::GetPointer(addr), GL_DYNAMIC_DRAW);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);
}
-GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const {
+GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
const auto search{reserve.find(addr)};
if (search == reserve.end()) {
return {};
@@ -46,11 +45,14 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32
return search->second;
}
-GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size) {
- GlobalRegion region{TryGetReservedGlobalRegion(addr, size)};
+GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size,
+ u8* host_ptr) {
+ GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
if (!region) {
// No reserved surface available, create a new one and reserve it
- region = std::make_shared<CachedGlobalRegion>(addr, size);
+ auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
+ const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr);
+ region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr);
ReserveGlobalRegion(region);
}
region->Reload(size);
@@ -58,7 +60,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 si
}
void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
- reserve.insert_or_assign(region->GetAddr(), std::move(region));
+ reserve.insert_or_assign(region->GetCacheAddr(), std::move(region));
}
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
@@ -69,22 +71,20 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
auto& gpu{Core::System::GetInstance().GPU()};
- const auto cbufs = gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)];
- const auto cbuf_addr = gpu.MemoryManager().GpuToCpuAddress(
- cbufs.const_buffers[global_region.GetCbufIndex()].address + global_region.GetCbufOffset());
- ASSERT(cbuf_addr);
-
- const auto actual_addr_gpu = Memory::Read64(*cbuf_addr);
- const auto size = Memory::Read32(*cbuf_addr + 8);
- const auto actual_addr = gpu.MemoryManager().GpuToCpuAddress(actual_addr_gpu);
- ASSERT(actual_addr);
+ auto& memory_manager{gpu.MemoryManager()};
+ const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]};
+ const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
+ global_region.GetCbufOffset()};
+ const auto actual_addr{memory_manager.Read<u64>(addr)};
+ const auto size{memory_manager.Read<u32>(addr + 8)};
// Look up global region in the cache based on address
- GlobalRegion region = TryGet(*actual_addr);
+ const auto& host_ptr{memory_manager.GetPointer(actual_addr)};
+ GlobalRegion region{TryGet(host_ptr)};
if (!region) {
// No global region found - create a new one
- region = GetUncachedGlobalRegion(*actual_addr, size);
+ region = GetUncachedGlobalRegion(actual_addr, size, host_ptr);
Register(region);
}
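The lookup above expects the const buffer to hold a small descriptor: a 64-bit GPU
address followed by a 32-bit size. Read through the GPU page table, that amounts to
the following (a sketch; desc_addr, index, and offset are illustrative names):

const GPUVAddr desc_addr{cbufs.const_buffers[index].address + offset};
const auto region_addr{memory_manager.Read<u64>(desc_addr)};     // bytes 0-7
const auto region_size{memory_manager.Read<u32>(desc_addr + 8)}; // bytes 8-11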
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
index ba2bdc60c..5a21ab66f 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ b/src/video_core/renderer_opengl/gl_global_cache.h
@@ -27,14 +27,12 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
class CachedGlobalRegion final : public RasterizerCacheObject {
public:
- explicit CachedGlobalRegion(VAddr addr, u32 size);
+ explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
- /// Gets the address of the shader in guest memory, required for cache management
- VAddr GetAddr() const override {
- return addr;
+ VAddr GetCpuAddr() const override {
+ return cpu_addr;
}
- /// Gets the size of the shader in guest memory, required for cache management
std::size_t GetSizeInBytes() const override {
return size;
}
@@ -53,9 +51,8 @@ public:
}
private:
- VAddr addr{};
+ VAddr cpu_addr{};
u32 size{};
-
OGLBuffer buffer;
};
@@ -68,11 +65,11 @@ public:
Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);
private:
- GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const;
- GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size);
+ GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
+ GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr);
void ReserveGlobalRegion(GlobalRegion region);
- std::unordered_map<VAddr, GlobalRegion> reserve;
+ std::unordered_map<CacheAddr, GlobalRegion> reserve;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
index 77d5cedd2..c3e94d917 100644
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
@@ -7,7 +7,7 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "core/core.h"
-#include "core/memory.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_primitive_assembler.h"
@@ -40,16 +40,12 @@ GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) {
return index_offset;
}
-GLintptr PrimitiveAssembler::MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size,
- u32 count) {
+GLintptr PrimitiveAssembler::MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count) {
const std::size_t map_size{CalculateQuadSize(count)};
auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size);
auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
- const auto cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
- ASSERT_MSG(cpu_addr, "Invalid GPU address");
-
- const u8* source{Memory::GetPointer(*cpu_addr)};
+ const u8* source{memory_manager.GetPointer(gpu_addr)};
for (u32 primitive = 0; primitive < count / 4; ++primitive) {
for (std::size_t i = 0; i < TRIANGLES_PER_QUAD; ++i) {
@@ -64,4 +60,4 @@ GLintptr PrimitiveAssembler::MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size
return index_offset;
}
-} // namespace OpenGL
\ No newline at end of file
+} // namespace OpenGL
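MakeQuadIndexed expands each quad into two triangles; the loop body is elided by
this hunk, but the conventional mapping it iterates over sends quad vertices
{0, 1, 2, 3} to triangles {0, 1, 2} and {0, 2, 3}. A sketch (the constant names are
illustrative, not necessarily the commit's):

constexpr std::array<u32, 6> QUAD_MAP{0, 1, 2, 0, 2, 3};

for (u32 primitive = 0; primitive < count / 4; ++primitive) {
    for (std::size_t i = 0; i < QUAD_MAP.size(); ++i) {
        const u32 index{primitive * 4 + QUAD_MAP[i]};
        // Read the source index at `index * index_size` bytes from `source`,
        // widen it to u32, and append it to `dst_pointer`.
    }
}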
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h
index a8cb88eb5..4e87ce4d6 100644
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.h
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.h
@@ -4,11 +4,9 @@
#pragma once
-#include <vector>
#include <glad/glad.h>
#include "common/common_types.h"
-#include "video_core/memory_manager.h"
namespace OpenGL {
@@ -24,7 +22,7 @@ public:
GLintptr MakeQuadArray(u32 first, u32 count);
- GLintptr MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size, u32 count);
+ GLintptr MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count);
private:
OGLBufferCache& buffer_cache;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 976f64c24..7ff1e6737 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -17,7 +17,6 @@
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "core/core.h"
-#include "core/frontend/emu_window.h"
#include "core/hle/kernel/process.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
@@ -26,7 +25,6 @@
#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
-#include "video_core/video_core.h"
namespace OpenGL {
@@ -100,11 +98,9 @@ struct FramebufferCacheKey {
}
};
-RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system,
- ScreenInfo& info)
- : res_cache{*this}, shader_cache{*this, system}, global_cache{*this},
- emu_window{window}, system{system}, screen_info{info},
- buffer_cache(*this, STREAM_BUFFER_SIZE) {
+RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
+ : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system},
+ screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
// Create sampler objects
for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
texture_samplers[i].Create();
@@ -225,8 +221,8 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
if (!vertex_array.IsEnabled())
continue;
- const Tegra::GPUVAddr start = vertex_array.StartAddress();
- const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
+ const GPUVAddr start = vertex_array.StartAddress();
+ const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
ASSERT(end > start);
const u64 size = end - start + 1;
@@ -320,7 +316,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
GLShader::MaxwellUniformData ubo{};
- ubo.SetFromRegs(gpu.state.shader_stages[stage]);
+ ubo.SetFromRegs(gpu, stage);
const GLintptr offset = buffer_cache.UploadHostMemory(
&ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
@@ -421,8 +417,8 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
if (!regs.vertex_array[index].IsEnabled())
continue;
- const Tegra::GPUVAddr start = regs.vertex_array[index].StartAddress();
- const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
+ const GPUVAddr start = regs.vertex_array[index].StartAddress();
+ const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
ASSERT(end > start);
size += end - start + 1;
@@ -449,7 +445,7 @@ static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
return boost::make_iterator_range(map.equal_range(interval));
}
-void RasterizerOpenGL::UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {
+void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
const u64 page_start{addr >> Memory::PAGE_BITS};
const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};
@@ -747,20 +743,26 @@ void RasterizerOpenGL::DrawArrays() {
void RasterizerOpenGL::FlushAll() {}
-void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+ if (!addr || !size) {
+ return;
+ }
res_cache.FlushRegion(addr, size);
}
-void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+ if (!addr || !size) {
+ return;
+ }
res_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size);
global_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
}
-void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
FlushRegion(addr, size);
InvalidateRegion(addr, size);
}
@@ -782,7 +784,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- const auto& surface{res_cache.TryFindFramebufferSurface(framebuffer_addr)};
+ const auto& surface{res_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))};
if (!surface) {
return {};
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index ca3de0592..54fbf48aa 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -12,15 +12,12 @@
#include <optional>
#include <tuple>
#include <utility>
-#include <vector>
#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range.hpp>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -29,10 +26,8 @@
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state.h"
-#include "video_core/renderer_opengl/gl_stream_buffer.h"
namespace Core {
class System;
@@ -50,16 +45,15 @@ struct FramebufferCacheKey;
class RasterizerOpenGL : public VideoCore::RasterizerInterface {
public:
- explicit RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system,
- ScreenInfo& info);
+ explicit RasterizerOpenGL(Core::System& system, ScreenInfo& info);
~RasterizerOpenGL() override;
void DrawArrays() override;
void Clear() override;
void FlushAll() override;
- void FlushRegion(VAddr addr, u64 size) override;
- void InvalidateRegion(VAddr addr, u64 size) override;
- void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+ void FlushRegion(CacheAddr addr, u64 size) override;
+ void InvalidateRegion(CacheAddr addr, u64 size) override;
+ void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Common::Rectangle<u32>& src_rect,
@@ -67,7 +61,7 @@ public:
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
bool AccelerateDrawBatch(bool is_indexed) override;
- void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) override;
+ void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
void LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override;
@@ -214,7 +208,6 @@ private:
ShaderCacheOpenGL shader_cache;
GlobalRegionCacheOpenGL global_cache;
- Core::Frontend::EmuWindow& emu_window;
Core::System& system;
ScreenInfo& screen_info;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index bd1409660..5876145ef 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -13,7 +13,6 @@
#include "common/scope_exit.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
-#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/morton.h"
@@ -55,12 +54,11 @@ static void ApplyTextureDefaults(GLuint texture, u32 max_mip_level) {
}
}
-void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) {
+void SurfaceParams::InitCacheParameters(GPUVAddr gpu_addr_) {
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
- const auto cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr_)};
- addr = cpu_addr ? *cpu_addr : 0;
gpu_addr = gpu_addr_;
+ host_ptr = memory_manager.GetPointer(gpu_addr_);
size_in_bytes = SizeInBytesRaw();
if (IsPixelFormatASTC(pixel_format)) {
@@ -223,7 +221,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only,
}
/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(
- u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
+ u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format,
u32 block_width, u32 block_height, u32 block_depth,
Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) {
SurfaceParams params{};
@@ -446,7 +444,7 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level),
params.MipBlockHeight(mip_level), params.MipHeight(mip_level),
params.MipBlockDepth(mip_level), 1, params.tile_width_spacing,
- gl_buffer.data() + offset_gl, params.addr + offset);
+ gl_buffer.data() + offset_gl, params.host_ptr + offset);
offset += layer_size;
offset_gl += gl_size;
}
@@ -455,7 +453,7 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level),
params.MipBlockHeight(mip_level), params.MipHeight(mip_level),
params.MipBlockDepth(mip_level), depth, params.tile_width_spacing,
- gl_buffer.data(), params.addr + offset);
+ gl_buffer.data(), params.host_ptr + offset);
}
}
@@ -513,9 +511,9 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac
"reinterpretation but the texture is tiled.");
}
const std::size_t remaining_size = dst_params.size_in_bytes - src_params.size_in_bytes;
-
+ auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
glBufferSubData(GL_PIXEL_PACK_BUFFER, src_params.size_in_bytes, remaining_size,
- Memory::GetPointer(dst_params.addr + src_params.size_in_bytes));
+ memory_manager.GetPointer(dst_params.gpu_addr + src_params.size_in_bytes));
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -563,8 +561,14 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac
}
CachedSurface::CachedSurface(const SurfaceParams& params)
- : params(params), gl_target(SurfaceTargetToGL(params.target)),
- cached_size_in_bytes(params.size_in_bytes) {
+ : RasterizerCacheObject{params.host_ptr}, params{params},
+ gl_target{SurfaceTargetToGL(params.target)}, cached_size_in_bytes{params.size_in_bytes} {
+
+ const auto optional_cpu_addr{
+ Core::System::GetInstance().GPU().MemoryManager().GpuToCpuAddress(params.gpu_addr)};
+ ASSERT_MSG(optional_cpu_addr, "optional_cpu_addr is invalid");
+ cpu_addr = *optional_cpu_addr;
+
texture.Create(gl_target);
// TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0)
@@ -603,19 +607,7 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
ApplyTextureDefaults(texture.handle, params.max_mip_level);
- OpenGL::LabelGLObject(GL_TEXTURE, texture.handle, params.addr, params.IdentityString());
-
- // Clamp size to mapped GPU memory region
- // TODO(bunnei): Super Mario Odyssey maps a 0x40000 byte region and then uses it for a 0x80000
- // R32F render buffer. We do not yet know if this is a game bug or something else, but this
- // check is necessary to prevent flushing from overwriting unmapped memory.
-
- auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
- const u64 max_size{memory_manager.GetRegionEnd(params.gpu_addr) - params.gpu_addr};
- if (cached_size_in_bytes > max_size) {
- LOG_ERROR(HW_GPU, "Surface size {} exceeds region size {}", params.size_in_bytes, max_size);
- cached_size_in_bytes = max_size;
- }
+ OpenGL::LabelGLObject(GL_TEXTURE, texture.handle, params.gpu_addr, params.IdentityString());
}
MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64));
@@ -633,10 +625,9 @@ void CachedSurface::LoadGLBuffer() {
const u32 bpp = params.GetFormatBpp() / 8;
const u32 copy_size = params.width * bpp;
if (params.pitch == copy_size) {
- std::memcpy(gl_buffer[0].data(), Memory::GetPointer(params.addr),
- params.size_in_bytes_gl);
+ std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
} else {
- const u8* start = Memory::GetPointer(params.addr);
+ const u8* start{params.host_ptr};
u8* write_to = gl_buffer[0].data();
for (u32 h = params.height; h > 0; h--) {
std::memcpy(write_to, start, copy_size);
@@ -670,8 +661,8 @@ void CachedSurface::FlushGLBuffer() {
gl_buffer[0].resize(GetSizeInBytes());
const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
- // Ensure no bad interactions with GL_UNPACK_ALIGNMENT
- ASSERT(params.width * GetBytesPerPixel(params.pixel_format) % 4 == 0);
+ const u32 align = std::clamp(params.RowAlign(0), 1U, 8U);
+ glPixelStorei(GL_PACK_ALIGNMENT, align);
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width));
ASSERT(!tuple.compressed);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -680,8 +671,6 @@ void CachedSurface::FlushGLBuffer() {
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,
params.height, params.depth, true, true);
- const u8* const texture_src_data = Memory::GetPointer(params.addr);
- ASSERT(texture_src_data);
if (params.is_tiled) {
ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",
params.block_width, static_cast<u32>(params.target));
@@ -691,9 +680,9 @@ void CachedSurface::FlushGLBuffer() {
const u32 bpp = params.GetFormatBpp() / 8;
const u32 copy_size = params.width * bpp;
if (params.pitch == copy_size) {
- std::memcpy(Memory::GetPointer(params.addr), gl_buffer[0].data(), GetSizeInBytes());
+ std::memcpy(params.host_ptr, gl_buffer[0].data(), GetSizeInBytes());
} else {
- u8* start = Memory::GetPointer(params.addr);
+ u8* start{params.host_ptr};
const u8* read_to = gl_buffer[0].data();
for (u32 h = params.height; h > 0; h--) {
std::memcpy(start, read_to, copy_size);
@@ -718,8 +707,8 @@ void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle,
const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
- // Ensure no bad interactions with GL_UNPACK_ALIGNMENT
- ASSERT(params.MipWidth(mip_map) * GetBytesPerPixel(params.pixel_format) % 4 == 0);
+ const u32 align = std::clamp(params.RowAlign(mip_map), 1U, 8U);
+ glPixelStorei(GL_UNPACK_ALIGNMENT, align);
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.MipWidth(mip_map)));
const auto image_size = static_cast<GLsizei>(params.GetMipmapSizeGL(mip_map, false));
@@ -927,12 +916,12 @@ void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
}
Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) {
- if (params.addr == 0 || params.height * params.width == 0) {
+ if (!params.IsValid()) {
return {};
}
// Look up surface in the cache based on address
- Surface surface{TryGet(params.addr)};
+ Surface surface{TryGet(params.host_ptr)};
if (surface) {
if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
// Use the cached surface as-is unless it's not synced with memory
@@ -943,7 +932,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
// If surface parameters changed and we care about keeping the previous data, recreate
// the surface from the old one
Surface new_surface{RecreateSurface(surface, params)};
- UnregisterSurface(surface);
+ Unregister(surface);
Register(new_surface);
if (new_surface->IsUploaded()) {
RegisterReinterpretSurface(new_surface);
@@ -951,7 +940,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
return new_surface;
} else {
// Delete the old surface before creating a new one to prevent collisions.
- UnregisterSurface(surface);
+ Unregister(surface);
}
}
@@ -981,14 +970,16 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
const Surface& dst_surface) {
const auto& init_params{src_surface->GetSurfaceParams()};
const auto& dst_params{dst_surface->GetSurfaceParams()};
- VAddr address = init_params.addr;
- const std::size_t layer_size = dst_params.LayerMemorySize();
+ auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
+ GPUVAddr address{init_params.gpu_addr};
+ const std::size_t layer_size{dst_params.LayerMemorySize()};
for (u32 layer = 0; layer < dst_params.depth; layer++) {
for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) {
- const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap);
- const Surface& copy = TryGet(sub_address);
- if (!copy)
+ const GPUVAddr sub_address{address + dst_params.GetMipmapLevelOffset(mipmap)};
+ const Surface& copy{TryGet(memory_manager.GetPointer(sub_address))};
+ if (!copy) {
continue;
+ }
const auto& src_params{copy->GetSurfaceParams()};
const u32 width{std::min(src_params.width, dst_params.MipWidth(mipmap))};
const u32 height{std::min(src_params.height, dst_params.MipHeight(mipmap))};
@@ -1163,7 +1154,8 @@ void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface,
const auto& dst_params{dst_surface->GetSurfaceParams()};
// Flush enough memory for both the source and destination surface
- FlushRegion(src_params.addr, std::max(src_params.MemorySize(), dst_params.MemorySize()));
+ FlushRegion(ToCacheAddr(src_params.host_ptr),
+ std::max(src_params.MemorySize(), dst_params.MemorySize()));
LoadSurface(dst_surface);
}
@@ -1215,8 +1207,8 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
return new_surface;
}
-Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr addr) const {
- return TryGet(addr);
+Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(const u8* host_ptr) const {
+ return TryGet(host_ptr);
}
void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) {
@@ -1243,9 +1235,9 @@ static std::optional<u32> TryFindBestMipMap(std::size_t memory, const SurfacePar
return {};
}
-static std::optional<u32> TryFindBestLayer(VAddr addr, const SurfaceParams params, u32 mipmap) {
- const std::size_t size = params.LayerMemorySize();
- VAddr start = params.addr + params.GetMipmapLevelOffset(mipmap);
+static std::optional<u32> TryFindBestLayer(GPUVAddr addr, const SurfaceParams params, u32 mipmap) {
+ const std::size_t size{params.LayerMemorySize()};
+ GPUVAddr start{params.gpu_addr + params.GetMipmapLevelOffset(mipmap)};
for (u32 i = 0; i < params.depth; i++) {
if (start == addr) {
return {i};
@@ -1267,7 +1259,7 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
src_params.height == dst_params.MipHeight(*level) &&
src_params.block_height >= dst_params.MipBlockHeight(*level)) {
const std::optional<u32> slot =
- TryFindBestLayer(render_surface->GetAddr(), dst_params, *level);
+ TryFindBestLayer(render_surface->GetSurfaceParams().gpu_addr, dst_params, *level);
if (slot.has_value()) {
glCopyImageSubData(render_surface->Texture().handle,
SurfaceTargetToGL(src_params.target), 0, 0, 0, 0,
@@ -1283,8 +1275,8 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
}
static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) {
- const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize();
- const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize();
+ const VAddr bound1 = blitted_surface->GetCpuAddr() + blitted_surface->GetMemorySize();
+ const VAddr bound2 = render_surface->GetCpuAddr() + render_surface->GetMemorySize();
if (bound2 > bound1)
return true;
const auto& dst_params = blitted_surface->GetSurfaceParams();
@@ -1302,12 +1294,12 @@ static bool IsReinterpretInvalidSecond(const Surface render_surface,
bool RasterizerCacheOpenGL::PartialReinterpretSurface(Surface triggering_surface,
Surface intersect) {
if (IsReinterpretInvalid(triggering_surface, intersect)) {
- UnregisterSurface(intersect);
+ Unregister(intersect);
return false;
}
if (!LayerFitReinterpretSurface(*this, triggering_surface, intersect)) {
if (IsReinterpretInvalidSecond(triggering_surface, intersect)) {
- UnregisterSurface(intersect);
+ Unregister(intersect);
return false;
}
FlushObject(intersect);
@@ -1327,7 +1319,8 @@ void RasterizerCacheOpenGL::SignalPreDrawCall() {
void RasterizerCacheOpenGL::SignalPostDrawCall() {
for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {
if (current_color_buffers[i] != nullptr) {
- Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr());
+ Surface intersect =
+ CollideOnReinterpretedSurface(current_color_buffers[i]->GetCacheAddr());
if (intersect != nullptr) {
PartialReinterpretSurface(current_color_buffers[i], intersect);
texception = true;
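
The gl_rasterizer_cache.cpp hunks above rekey surfaces from emulated CPU addresses to host pointers resolved through the GPU MemoryManager. Below is a minimal sketch of that lookup flow, assuming simplified stand-in types (the page_map, MemoryManager and Surface here are hypothetical placeholders, not the real classes):

```cpp
#include <cstdint>
#include <unordered_map>

using GPUVAddr = std::uint64_t;
using CacheAddr = std::uintptr_t;

// Hypothetical stand-in for Tegra::MemoryManager: resolves a GPU virtual
// address to the host pointer backing it, or nullptr when unmapped.
struct MemoryManager {
    std::unordered_map<GPUVAddr, std::uint8_t*> page_map; // grossly simplified
    std::uint8_t* GetPointer(GPUVAddr gpu_addr) {
        const auto it = page_map.find(gpu_addr);
        return it != page_map.end() ? it->second : nullptr;
    }
};

struct Surface {};

// Mirrors how ToCacheAddr is used above: the host pointer itself is the key.
inline CacheAddr ToCacheAddr(const std::uint8_t* host_ptr) {
    return reinterpret_cast<CacheAddr>(host_ptr);
}

Surface* TryGetSurface(std::unordered_map<CacheAddr, Surface*>& cache,
                       MemoryManager& memory_manager, GPUVAddr gpu_addr) {
    const std::uint8_t* host_ptr = memory_manager.GetPointer(gpu_addr);
    if (host_ptr == nullptr) {
        return nullptr; // unmapped GPU address: nothing can be cached here
    }
    const auto it = cache.find(ToCacheAddr(host_ptr));
    return it != cache.end() ? it->second : nullptr;
}
```
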
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 9cf6f50be..db280dbb3 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -5,13 +5,13 @@
#pragma once
#include <array>
-#include <map>
#include <memory>
#include <string>
-#include <unordered_set>
+#include <tuple>
#include <vector>
#include "common/alignment.h"
+#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/hash.h"
#include "common/math_util.h"
@@ -109,6 +109,11 @@ struct SurfaceParams {
return size;
}
+    /// Returns true if the parameters describe a valid rasterizer surface,
+    /// i.e. a mapped address with non-zero dimensions.
+    bool IsValid() const {
+        return gpu_addr && host_ptr && height && width;
+    }
+
/// Returns the exact size of the memory occupied by a layer in a texture in VRAM, including
/// mipmaps.
std::size_t LayerMemorySize() const {
@@ -201,6 +206,13 @@ struct SurfaceParams {
return bd;
}
+ u32 RowAlign(u32 mip_level) const {
+ const u32 m_width = MipWidth(mip_level);
+ const u32 bytes_per_pixel = GetBytesPerPixel(pixel_format);
+ const u32 l2 = Common::CountTrailingZeroes32(m_width * bytes_per_pixel);
+ return (1U << l2);
+ }
+
/// Creates SurfaceParams from a texture configuration
static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config,
const GLShader::SamplerEntry& entry);
@@ -210,7 +222,7 @@ struct SurfaceParams {
/// Creates SurfaceParams for a depth buffer configuration
static SurfaceParams CreateForDepthBuffer(
- u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format,
+ u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format,
u32 block_width, u32 block_height, u32 block_depth,
Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type);
@@ -232,7 +244,7 @@ struct SurfaceParams {
}
    /// Initializes parameters for caching; should be called after everything else has been
    /// initialized
- void InitCacheParameters(Tegra::GPUVAddr gpu_addr);
+ void InitCacheParameters(GPUVAddr gpu_addr);
std::string TargetName() const {
switch (target) {
@@ -296,8 +308,8 @@ struct SurfaceParams {
bool is_array;
bool srgb_conversion;
// Parameters used for caching
- VAddr addr;
- Tegra::GPUVAddr gpu_addr;
+ u8* host_ptr;
+ GPUVAddr gpu_addr;
std::size_t size_in_bytes;
std::size_t size_in_bytes_gl;
@@ -345,10 +357,10 @@ class RasterizerOpenGL;
class CachedSurface final : public RasterizerCacheObject {
public:
- CachedSurface(const SurfaceParams& params);
+ explicit CachedSurface(const SurfaceParams& params);
- VAddr GetAddr() const override {
- return params.addr;
+ VAddr GetCpuAddr() const override {
+ return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
@@ -432,6 +444,7 @@ private:
std::size_t memory_size;
bool reinterpreted = false;
bool must_reload = false;
+ VAddr cpu_addr{};
};
class RasterizerCacheOpenGL final : public RasterizerCache<Surface> {
@@ -449,7 +462,7 @@ public:
Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);
    /// Tries to find a framebuffer using the provided host pointer
- Surface TryFindFramebufferSurface(VAddr addr) const;
+ Surface TryFindFramebufferSurface(const u8* host_ptr) const;
/// Copies the contents of one surface to another
void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
@@ -506,12 +519,12 @@ private:
std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
Surface last_depth_buffer;
- using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>;
+ using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>;
using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
static auto GetReinterpretInterval(const Surface& object) {
- return SurfaceInterval::right_open(object->GetAddr() + 1,
- object->GetAddr() + object->GetMemorySize() - 1);
+ return SurfaceInterval::right_open(object->GetCacheAddr() + 1,
+ object->GetCacheAddr() + object->GetMemorySize() - 1);
}
    // Reinterpreted surfaces are very fragile as the game may keep rendering into them.
@@ -523,7 +536,7 @@ private:
reinterpret_surface->MarkReinterpreted();
}
- Surface CollideOnReinterpretedSurface(VAddr addr) const {
+ Surface CollideOnReinterpretedSurface(CacheAddr addr) const {
const SurfaceInterval interval{addr};
for (auto& pair :
boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) {
@@ -532,13 +545,17 @@ private:
return nullptr;
}
+ void Register(const Surface& object) override {
+ RasterizerCache<Surface>::Register(object);
+ }
+
/// Unregisters an object from the cache
- void UnregisterSurface(const Surface& object) {
+ void Unregister(const Surface& object) override {
if (object->IsReinterpreted()) {
auto interval = GetReinterpretInterval(object);
reinterpreted_surfaces.erase(interval);
}
- Unregister(object);
+ RasterizerCache<Surface>::Unregister(object);
}
};
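
RowAlign, added above, picks the largest power of two that divides a mip level's row size in bytes; the call sites in gl_rasterizer_cache.cpp then clamp it to the range glPixelStorei accepts. A self-contained sketch of the computation follows (CountTrailingZeroes32 is replaced by a portable loop here; the real Common helper wraps a compiler intrinsic):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Portable stand-in for Common::CountTrailingZeroes32.
std::uint32_t CountTrailingZeroes32(std::uint32_t value) {
    std::uint32_t count = 0;
    while (value != 0 && (value & 1u) == 0) {
        value >>= 1;
        ++count;
    }
    return count;
}

// RowAlign as added above: the greatest power of two dividing the row size.
std::uint32_t RowAlign(std::uint32_t mip_width, std::uint32_t bytes_per_pixel) {
    return 1u << CountTrailingZeroes32(mip_width * bytes_per_pixel);
}

int main() {
    // Call sites clamp to the alignments glPixelStorei accepts (1, 2, 4 or 8).
    std::cout << std::clamp(RowAlign(57, 4), 1u, 8u) << '\n';  // 228 -> 4
    std::cout << std::clamp(RowAlign(256, 4), 1u, 8u) << '\n'; // 1024 -> 8
}
```
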
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4883e4f62..ab381932c 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -6,13 +6,11 @@
#include "common/assert.h"
#include "common/hash.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/utils.h"
#include "video_core/shader/shader_ir.h"
@@ -32,19 +30,20 @@ struct UnspecializedShader {
namespace {
/// Gets the address for the specified shader stage program
-VAddr GetShaderAddress(Maxwell::ShaderProgram program) {
- const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
- const auto& shader_config = gpu.regs.shader_config[static_cast<std::size_t>(program)];
- const auto address = gpu.memory_manager.GpuToCpuAddress(gpu.regs.code_address.CodeAddress() +
- shader_config.offset);
- ASSERT_MSG(address, "Invalid GPU address");
- return *address;
+GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
+ const auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()};
+ const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
+ return gpu.regs.code_address.CodeAddress() + shader_config.offset;
}
 /// Gets the shader program code from memory at the specified host pointer
-ProgramCode GetShaderCode(VAddr addr) {
+ProgramCode GetShaderCode(const u8* host_ptr) {
ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
- Memory::ReadBlock(addr, program_code.data(), program_code.size() * sizeof(u64));
+ ASSERT_OR_EXECUTE(host_ptr != nullptr, {
+ std::fill(program_code.begin(), program_code.end(), 0);
+ return program_code;
+ });
+ std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
return program_code;
}
@@ -214,12 +213,13 @@ std::set<GLenum> GetSupportedFormats() {
} // namespace
-CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
- ShaderDiskCacheOpenGL& disk_cache,
+CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
+ Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs,
- ProgramCode&& program_code, ProgramCode&& program_code_b)
- : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type},
- disk_cache{disk_cache}, precompiled_programs{precompiled_programs} {
+ ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
+ : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr},
+ unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
+ precompiled_programs{precompiled_programs} {
const std::size_t code_size = CalculateProgramSize(program_code);
const std::size_t code_size_b =
@@ -243,12 +243,13 @@ CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderPro
disk_cache.SaveRaw(raw);
}
-CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
- ShaderDiskCacheOpenGL& disk_cache,
+CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
+ Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs,
- GLShader::ProgramResult result)
- : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type},
- disk_cache{disk_cache}, precompiled_programs{precompiled_programs} {
+ GLShader::ProgramResult result, u8* host_ptr)
+ : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier},
+ program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{
+ precompiled_programs} {
code = std::move(result.first);
entries = result.second;
@@ -271,7 +272,7 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive
disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
}
- LabelGLObject(GL_PROGRAM, program->handle, addr);
+ LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
}
handle = program->handle;
@@ -323,7 +324,7 @@ GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBind
disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
}
- LabelGLObject(GL_PROGRAM, target_program->handle, addr, debug_name);
+ LabelGLObject(GL_PROGRAM, target_program->handle, cpu_addr, debug_name);
return target_program->handle;
};
@@ -486,29 +487,32 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
return last_shaders[static_cast<u32>(program)];
}
- const VAddr program_addr{GetShaderAddress(program)};
+ auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
+ const GPUVAddr program_addr{GetShaderAddress(program)};
// Look up shader in the cache based on address
- Shader shader{TryGet(program_addr)};
+ const auto& host_ptr{memory_manager.GetPointer(program_addr)};
+ Shader shader{TryGet(host_ptr)};
if (!shader) {
// No shader found - create a new one
- ProgramCode program_code = GetShaderCode(program_addr);
+ ProgramCode program_code{GetShaderCode(host_ptr)};
ProgramCode program_code_b;
if (program == Maxwell::ShaderProgram::VertexA) {
- program_code_b = GetShaderCode(GetShaderAddress(Maxwell::ShaderProgram::VertexB));
+ program_code_b = GetShaderCode(
+ memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));
}
const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
-
+ const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
const auto found = precompiled_shaders.find(unique_identifier);
if (found != precompiled_shaders.end()) {
shader =
- std::make_shared<CachedShader>(program_addr, unique_identifier, program, disk_cache,
- precompiled_programs, found->second);
+ std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache,
+ precompiled_programs, found->second, host_ptr);
} else {
shader = std::make_shared<CachedShader>(
- program_addr, unique_identifier, program, disk_cache, precompiled_programs,
- std::move(program_code), std::move(program_code_b));
+ cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
+ std::move(program_code), std::move(program_code_b), host_ptr);
}
Register(shader);
}
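
With shaders now tracked by GPU address, GetShaderCode reads through a host pointer and must tolerate an unmapped address. A sketch of that defensive copy, assuming a placeholder value for MAX_PROGRAM_LENGTH (the real VideoCommon::Shader constant is not shown in this diff):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using ProgramCode = std::vector<std::uint64_t>;
constexpr std::size_t MAX_PROGRAM_LENGTH = 0x1000; // placeholder value

ProgramCode GetShaderCode(const std::uint8_t* host_ptr) {
    // Zero-initialized, so the null-pointer path below can return it as-is,
    // mirroring what ASSERT_OR_EXECUTE does above after reporting the error.
    ProgramCode program_code(MAX_PROGRAM_LENGTH);
    if (host_ptr == nullptr) {
        return program_code;
    }
    std::memcpy(program_code.data(), host_ptr,
                program_code.size() * sizeof(std::uint64_t));
    return program_code;
}
```
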
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 97eed192f..0cf8e0b3d 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -5,21 +5,20 @@
#pragma once
#include <array>
+#include <atomic>
#include <memory>
#include <set>
#include <tuple>
#include <unordered_map>
+#include <vector>
#include <glad/glad.h>
-#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/rasterizer_cache.h"
-#include "video_core/renderer_base.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
namespace Core {
class System;
@@ -39,18 +38,18 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
class CachedShader final : public RasterizerCacheObject {
public:
- explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
- ShaderDiskCacheOpenGL& disk_cache,
+ explicit CachedShader(VAddr cpu_addr, u64 unique_identifier,
+ Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs,
- ProgramCode&& program_code, ProgramCode&& program_code_b);
+ ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
- explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type,
- ShaderDiskCacheOpenGL& disk_cache,
+ explicit CachedShader(VAddr cpu_addr, u64 unique_identifier,
+ Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs,
- GLShader::ProgramResult result);
+ GLShader::ProgramResult result, u8* host_ptr);
- VAddr GetAddr() const override {
- return addr;
+ VAddr GetCpuAddr() const override {
+ return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
@@ -91,7 +90,8 @@ private:
ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const;
- VAddr addr{};
+ u8* host_ptr{};
+ VAddr cpu_addr{};
u64 unique_identifier{};
Maxwell::ShaderProgram program_type{};
ShaderDiskCacheOpenGL& disk_cache;
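
CachedShader now carries both addresses: the host pointer that keys it in the cache and the CPU address reported through GetCpuAddr() for range flushes. A hedged sketch of that dual-address base-class shape (simplified, not the real RasterizerCacheObject):

```cpp
#include <cstddef>
#include <cstdint>

using VAddr = std::uint64_t;
using CacheAddr = std::uintptr_t;

class RasterizerCacheObject {
public:
    explicit RasterizerCacheObject(const std::uint8_t* host_ptr)
        : host_ptr{host_ptr}, cache_addr{reinterpret_cast<CacheAddr>(host_ptr)} {}
    virtual ~RasterizerCacheObject() = default;

    // Key used by the cache containers (derived from the host pointer).
    CacheAddr GetCacheAddr() const {
        return cache_addr;
    }

    const std::uint8_t* GetHostPtr() const {
        return host_ptr;
    }

    // Emulated CPU address, still needed to service CPU-side range flushes.
    virtual VAddr GetCpuAddr() const = 0;
    virtual std::size_t GetSizeInBytes() const = 0;

private:
    const std::uint8_t* host_ptr{};
    CacheAddr cache_addr{};
};
```
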
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 11d1169f0..3ea08ef7b 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -21,6 +21,8 @@
namespace OpenGL::GLShader {
+namespace {
+
using Tegra::Shader::Attribute;
using Tegra::Shader::AttributeUse;
using Tegra::Shader::Header;
@@ -34,14 +36,18 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
using Operation = const OperationNode&;
+enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
+
+struct TextureAoffi {};
+using TextureArgument = std::pair<Type, Node>;
+using TextureIR = std::variant<TextureAoffi, TextureArgument>;
+
enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);
-enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
-
class ShaderWriter {
public:
void AddExpression(std::string_view text) {
@@ -69,10 +75,10 @@ public:
shader_source += '\n';
}
- std::string GenerateTemporal() {
- std::string temporal = "tmp";
- temporal += std::to_string(temporal_index++);
- return temporal;
+ std::string GenerateTemporary() {
+ std::string temporary = "tmp";
+ temporary += std::to_string(temporary_index++);
+ return temporary;
}
std::string GetResult() {
@@ -87,11 +93,11 @@ private:
}
std::string shader_source;
- u32 temporal_index = 1;
+ u32 temporary_index = 1;
};
/// Generates code to use for a swizzle operation.
-static std::string GetSwizzle(u32 elem) {
+std::string GetSwizzle(u32 elem) {
ASSERT(elem <= 3);
std::string swizzle = ".";
swizzle += "xyzw"[elem];
@@ -99,7 +105,7 @@ static std::string GetSwizzle(u32 elem) {
}
/// Translate topology
-static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
+std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
switch (topology) {
case Tegra::Shader::OutputTopology::PointList:
return "points";
@@ -114,7 +120,7 @@ static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
}
/// Returns true if an object has to be treated as precise
-static bool IsPrecise(Operation operand) {
+bool IsPrecise(Operation operand) {
const auto& meta = operand.GetMeta();
if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
@@ -126,7 +132,7 @@ static bool IsPrecise(Operation operand) {
return false;
}
-static bool IsPrecise(Node node) {
+bool IsPrecise(Node node) {
if (const auto operation = std::get_if<OperationNode>(node)) {
return IsPrecise(*operation);
}
@@ -426,9 +432,14 @@ private:
std::string Visit(Node node) {
if (const auto operation = std::get_if<OperationNode>(node)) {
const auto operation_index = static_cast<std::size_t>(operation->GetCode());
+ if (operation_index >= operation_decompilers.size()) {
+ UNREACHABLE_MSG("Out of bounds operation: {}", operation_index);
+ return {};
+ }
const auto decompiler = operation_decompilers[operation_index];
if (decompiler == nullptr) {
- UNREACHABLE_MSG("Operation decompiler {} not defined", operation_index);
+ UNREACHABLE_MSG("Undefined operation: {}", operation_index);
+ return {};
}
return (this->*decompiler)(*operation);
@@ -540,7 +551,7 @@ private:
} else if (std::holds_alternative<OperationNode>(*offset)) {
// Indirect access
- const std::string final_offset = code.GenerateTemporal();
+ const std::string final_offset = code.GenerateTemporary();
code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4) & " +
std::to_string(MAX_CONSTBUFFER_ELEMENTS - 1) + ';');
return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()),
@@ -587,9 +598,9 @@ private:
// There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
- const std::string temporal = code.GenerateTemporal();
- code.AddLine(precise + "float " + temporal + " = " + value + ';');
- return temporal;
+ const std::string temporary = code.GenerateTemporary();
+ code.AddLine(precise + "float " + temporary + " = " + value + ';');
+ return temporary;
}
std::string VisitOperand(Operation operation, std::size_t operand_index) {
@@ -601,9 +612,9 @@ private:
return Visit(operand);
}
- const std::string temporal = code.GenerateTemporal();
- code.AddLine("float " + temporal + " = " + Visit(operand) + ';');
- return temporal;
+ const std::string temporary = code.GenerateTemporary();
+ code.AddLine("float " + temporary + " = " + Visit(operand) + ';');
+ return temporary;
}
std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
@@ -718,8 +729,8 @@ private:
result_type));
}
- std::string GenerateTexture(Operation operation, const std::string& func,
- const std::vector<std::pair<Type, Node>>& extras) {
+ std::string GenerateTexture(Operation operation, const std::string& function_suffix,
+ const std::vector<TextureIR>& extras) {
constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"};
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
@@ -729,11 +740,11 @@ private:
const bool has_array = meta->sampler.IsArray();
const bool has_shadow = meta->sampler.IsShadow();
- std::string expr = func;
- expr += '(';
- expr += GetSampler(meta->sampler);
- expr += ", ";
-
+ std::string expr = "texture" + function_suffix;
+ if (!meta->aoffi.empty()) {
+ expr += "Offset";
+ }
+ expr += '(' + GetSampler(meta->sampler) + ", ";
expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
expr += '(';
for (std::size_t i = 0; i < count; ++i) {
@@ -751,36 +762,74 @@ private:
}
expr += ')';
- for (const auto& extra_pair : extras) {
- const auto [type, operand] = extra_pair;
- if (operand == nullptr) {
- continue;
+ for (const auto& variant : extras) {
+ if (const auto argument = std::get_if<TextureArgument>(&variant)) {
+ expr += GenerateTextureArgument(*argument);
+ } else if (std::get_if<TextureAoffi>(&variant)) {
+ expr += GenerateTextureAoffi(meta->aoffi);
+ } else {
+ UNREACHABLE();
}
- expr += ", ";
+ }
- switch (type) {
- case Type::Int:
- if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
- // Inline the string as an immediate integer in GLSL (some extra arguments are
- // required to be constant)
- expr += std::to_string(static_cast<s32>(immediate->GetValue()));
- } else {
- expr += "ftoi(" + Visit(operand) + ')';
- }
- break;
- case Type::Float:
- expr += Visit(operand);
- break;
- default: {
- const auto type_int = static_cast<u32>(type);
- UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
- expr += '0';
- break;
+ return expr + ')';
+ }
+
+ std::string GenerateTextureArgument(TextureArgument argument) {
+ const auto [type, operand] = argument;
+ if (operand == nullptr) {
+ return {};
+ }
+
+ std::string expr = ", ";
+ switch (type) {
+ case Type::Int:
+ if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+ // Inline the string as an immediate integer in GLSL (some extra arguments are
+ // required to be constant)
+ expr += std::to_string(static_cast<s32>(immediate->GetValue()));
+ } else {
+ expr += "ftoi(" + Visit(operand) + ')';
}
+ break;
+ case Type::Float:
+ expr += Visit(operand);
+ break;
+ default: {
+ const auto type_int = static_cast<u32>(type);
+ UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
+ expr += '0';
+ break;
+ }
+ }
+ return expr;
+ }
+
+ std::string GenerateTextureAoffi(const std::vector<Node>& aoffi) {
+ if (aoffi.empty()) {
+ return {};
+ }
+ constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"};
+ std::string expr = ", ";
+ expr += coord_constructors.at(aoffi.size() - 1);
+ expr += '(';
+
+ for (std::size_t index = 0; index < aoffi.size(); ++index) {
+ const auto operand{aoffi.at(index)};
+ if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+ // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
+ // to be constant by the standard).
+ expr += std::to_string(static_cast<s32>(immediate->GetValue()));
+ } else {
+ expr += "ftoi(" + Visit(operand) + ')';
+ }
+ if (index + 1 < aoffi.size()) {
+ expr += ", ";
}
}
+ expr += ')';
- return expr + ')';
+ return expr;
}
std::string Assign(Operation operation) {
@@ -1159,7 +1208,8 @@ private:
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(operation, "texture", {{Type::Float, meta->bias}});
+ std::string expr = GenerateTexture(
+ operation, "", {TextureAoffi{}, TextureArgument{Type::Float, meta->bias}});
if (meta->sampler.IsShadow()) {
expr = "vec4(" + expr + ')';
}
@@ -1170,7 +1220,8 @@ private:
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(operation, "textureLod", {{Type::Float, meta->lod}});
+ std::string expr = GenerateTexture(
+ operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureAoffi{}});
if (meta->sampler.IsShadow()) {
expr = "vec4(" + expr + ')';
}
@@ -1182,7 +1233,8 @@ private:
ASSERT(meta);
const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int;
- return GenerateTexture(operation, "textureGather", {{type, meta->component}}) +
+ return GenerateTexture(operation, "Gather",
+ {TextureArgument{type, meta->component}, TextureAoffi{}}) +
GetSwizzle(meta->element);
}
@@ -1196,11 +1248,12 @@ private:
switch (meta->element) {
case 0:
case 1:
- return "textureSize(" + sampler + ", " + lod + ')' + GetSwizzle(meta->element);
+ return "itof(int(textureSize(" + sampler + ", " + lod + ')' +
+ GetSwizzle(meta->element) + "))";
case 2:
return "0";
case 3:
- return "textureQueryLevels(" + sampler + ')';
+ return "itof(textureQueryLevels(" + sampler + "))";
}
UNREACHABLE();
return "0";
@@ -1211,8 +1264,8 @@ private:
ASSERT(meta);
if (meta->element < 2) {
- return "itof(int((" + GenerateTexture(operation, "textureQueryLod", {}) +
- " * vec2(256))" + GetSwizzle(meta->element) + "))";
+ return "itof(int((" + GenerateTexture(operation, "QueryLod", {}) + " * vec2(256))" +
+ GetSwizzle(meta->element) + "))";
}
return "0";
}
@@ -1565,6 +1618,8 @@ private:
ShaderWriter code;
};
+} // Anonymous namespace
+
std::string GetCommonDeclarations() {
const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
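
The TextureIR refactor above lets each texture operation list its extra GLSL arguments in order, with TextureAoffi marking where the offset argument belongs and also forcing the "Offset" function-name suffix. A runnable sketch of that variant dispatch (argument types simplified; sampler and coords are fixed strings here):

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <variant>
#include <vector>

struct TextureAoffi {}; // marker: emit the texel-offset argument here
using TextureArgument = std::pair<int, std::string>; // (type tag, expression)
using TextureIR = std::variant<TextureAoffi, TextureArgument>;

std::string GenerateCall(const std::string& function_suffix,
                         const std::vector<TextureIR>& extras) {
    const bool has_offset =
        std::any_of(extras.begin(), extras.end(), [](const TextureIR& v) {
            return std::holds_alternative<TextureAoffi>(v);
        });
    // As above: "texture" + suffix, plus "Offset" when an AOFFI is present.
    std::string expr = "texture" + function_suffix + (has_offset ? "Offset" : "");
    expr += "(sampler, coords";
    for (const auto& variant : extras) {
        if (const auto* argument = std::get_if<TextureArgument>(&variant)) {
            expr += ", " + argument->second;
        } else {
            expr += ", offset"; // the TextureAoffi alternative
        }
    }
    return expr + ')';
}

int main() {
    // The Lod variant above lists the lod first and the offset after it,
    // matching GLSL's textureLodOffset(sampler, P, lod, offset).
    std::cout << GenerateCall("Lod", {TextureArgument{0, "lod"}, TextureAoffi{}})
              << '\n';
}
```
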
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 72aca4938..4e04ab2f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -5,7 +5,6 @@
#pragma once
#include <array>
-#include <set>
#include <string>
#include <utility>
#include <vector>
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 82fc4d44b..8a43eb157 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -4,7 +4,6 @@
#include <cstring>
#include <fmt/format.h>
-#include <lz4.h>
#include "common/assert.h"
#include "common/common_paths.h"
@@ -12,6 +11,7 @@
#include "common/file_util.h"
#include "common/logging/log.h"
#include "common/scm_rev.h"
+#include "common/zstd_compression.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
@@ -49,39 +49,6 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
return hash;
}
-template <typename T>
-std::vector<u8> CompressData(const T* source, std::size_t source_size) {
- if (source_size > LZ4_MAX_INPUT_SIZE) {
- // Source size exceeds LZ4 maximum input size
- return {};
- }
- const auto source_size_int = static_cast<int>(source_size);
- const int max_compressed_size = LZ4_compressBound(source_size_int);
- std::vector<u8> compressed(max_compressed_size);
- const int compressed_size = LZ4_compress_default(reinterpret_cast<const char*>(source),
- reinterpret_cast<char*>(compressed.data()),
- source_size_int, max_compressed_size);
- if (compressed_size <= 0) {
- // Compression failed
- return {};
- }
- compressed.resize(compressed_size);
- return compressed;
-}
-
-std::vector<u8> DecompressData(const std::vector<u8>& compressed, std::size_t uncompressed_size) {
- std::vector<u8> uncompressed(uncompressed_size);
- const int size_check = LZ4_decompress_safe(reinterpret_cast<const char*>(compressed.data()),
- reinterpret_cast<char*>(uncompressed.data()),
- static_cast<int>(compressed.size()),
- static_cast<int>(uncompressed.size()));
- if (static_cast<int>(uncompressed_size) != size_check) {
- // Decompression failed
- return {};
- }
- return uncompressed;
-}
-
} // namespace
ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
@@ -292,7 +259,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
return {};
}
- dump.binary = DecompressData(compressed_binary, binary_length);
+ dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary);
if (dump.binary.empty()) {
return {};
}
@@ -321,7 +288,7 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
return {};
}
- const std::vector<u8> code = DecompressData(compressed_code, code_size);
+ const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code);
if (code.empty()) {
return {};
}
@@ -507,7 +474,8 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str
if (!IsUsable())
return;
- const std::vector<u8> compressed_code{CompressData(code.data(), code.size())};
+ const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault(
+ reinterpret_cast<const u8*>(code.data()), code.size())};
if (compressed_code.empty()) {
LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}",
unique_identifier);
@@ -537,7 +505,9 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
std::vector<u8> binary(binary_length);
glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());
- const std::vector<u8> compressed_binary = CompressData(binary.data(), binary.size());
+ const std::vector<u8> compressed_binary =
+ Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size());
+
if (compressed_binary.empty()) {
LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}",
usage.unique_identifier);
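
The disk cache above swaps its hand-rolled LZ4 helpers for the shared Zstandard wrappers; note that the decompress call no longer takes an expected size, since a zstd frame records its own decompressed length. A hedged illustration of what a CompressDataZSTDDefault-style wrapper can look like on the public libzstd API (this is an illustration, not the actual common/zstd_compression code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>
#include <zstd.h>

std::vector<std::uint8_t> CompressDataZSTDDefault(const std::uint8_t* source,
                                                  std::size_t source_size) {
    std::vector<std::uint8_t> compressed(ZSTD_compressBound(source_size));
    const std::size_t size =
        ZSTD_compress(compressed.data(), compressed.size(), source, source_size,
                      /*compressionLevel=*/3); // zstd's default level
    if (ZSTD_isError(size) != 0u) {
        return {}; // keep the old "empty vector on failure" contract
    }
    compressed.resize(size);
    return compressed;
}
```
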
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 7d96649af..8763d9c71 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -3,7 +3,6 @@
// Refer to the license.txt file included.
#include <fmt/format.h>
-#include "common/assert.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index fba8e681b..fad346b48 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -4,12 +4,9 @@
#pragma once
-#include <array>
-#include <string>
#include <vector>
#include "common/common_types.h"
-#include "video_core/engines/shader_bytecode.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/shader/shader_ir.h"
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 6a30c28d2..eaf3e03a0 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -2,15 +2,15 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include "core/core.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
namespace OpenGL::GLShader {
-void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) {
- const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
- const auto& regs = gpu.regs;
- const auto& state = gpu.state;
+using Tegra::Engines::Maxwell3D;
+
+void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shader_stage) {
+ const auto& regs = maxwell.regs;
+ const auto& state = maxwell.state;
// TODO(bunnei): Support more than one viewport
viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
@@ -18,7 +18,7 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh
u32 func = static_cast<u32>(regs.alpha_test_func);
// Normalize the gl variants of opCompare to be the same as the normal variants
- u32 op_gl_variant_base = static_cast<u32>(Tegra::Engines::Maxwell3D::Regs::ComparisonOp::Never);
+ const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never);
if (func >= op_gl_variant_base) {
func = func - op_gl_variant_base + 1U;
}
@@ -31,8 +31,9 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh
// Assign in which stage the position has to be flipped
// (the last stage before the fragment shader).
- if (gpu.regs.shader_config[static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry)].enable) {
- flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry);
+ constexpr u32 geometry_index = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry);
+ if (maxwell.regs.shader_config[geometry_index].enable) {
+ flip_stage = geometry_index;
} else {
flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB);
}
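
SetFromRegs now receives the Maxwell3D engine as a parameter instead of fetching it through the Core::System singleton, which keeps the uniform-data path free of global state. A minimal sketch of the pattern with hypothetical stripped-down types:

```cpp
#include <cstddef>

struct Maxwell3D {
    struct Regs {
        float viewport_scale_x = 1.0f;
    } regs;
};

struct MaxwellUniformData {
    float viewport_flip[2];

    void SetFromRegs(const Maxwell3D& maxwell, std::size_t /*shader_stage*/) {
        // Read straight from the engine that was handed in.
        viewport_flip[0] = maxwell.regs.viewport_scale_x < 0.0f ? -1.0f : 1.0f;
        viewport_flip[1] = 1.0f;
    }
};

int main() {
    Maxwell3D engine;            // in tests this can be a locally built engine
    MaxwellUniformData data{};
    data.SetFromRegs(engine, 0); // no Core::System::GetInstance() involved
}
```
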
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 4970aafed..37dcfefdb 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -12,14 +12,13 @@
namespace OpenGL::GLShader {
-using Tegra::Engines::Maxwell3D;
-
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
-// NOTE: Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
-// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
-// Not following that rule will cause problems on some AMD drivers.
+/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
+/// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
+/// Not following that rule will cause problems on some AMD drivers.
struct MaxwellUniformData {
- void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage);
+ void SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell, std::size_t shader_stage);
+
alignas(16) GLvec4 viewport_flip;
struct alignas(16) {
GLuint instance_id;
@@ -63,7 +62,6 @@ public:
UpdatePipeline();
state.draw.shader_program = 0;
state.draw.program_pipeline = pipeline.handle;
- state.geometry_shaders.enabled = (gs != 0);
}
private:
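
The @note above warns that the GL spec leaves it unclear whether trailing padding counts toward UNIFORM_BLOCK_DATA_SIZE. The illustration below, under the assumption of std140-style 16-byte member alignment, shows how ending on a vec4 sidesteps the question: with no trailing padding, both interpretations yield the same size.

```cpp
#include <cstdint>

using GLvec4 = float[4];
using GLuint = std::uint32_t;

struct alignas(16) ExampleUniformData {
    alignas(16) GLvec4 viewport_flip; // bytes 0..15
    alignas(16) GLuint flags[2];      // bytes 16..23, padded to 31
    alignas(16) GLvec4 tail;          // bytes 32..47: keep a vec4 at the end
};

// No implicit padding follows `tail`, so drivers that include or exclude
// trailing padding in UNIFORM_BLOCK_DATA_SIZE report the same value.
static_assert(sizeof(ExampleUniformData) == 48, "ends exactly on the vec4");
```
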
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 9419326a3..52d569a1b 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -10,16 +10,62 @@
namespace OpenGL {
-OpenGLState OpenGLState::cur_state;
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+OpenGLState OpenGLState::cur_state;
bool OpenGLState::s_rgb_used;
+namespace {
+
+template <typename T>
+bool UpdateValue(T& current_value, const T new_value) {
+ const bool changed = current_value != new_value;
+ current_value = new_value;
+ return changed;
+}
+
+template <typename T1, typename T2>
+bool UpdateTie(T1 current_value, const T2 new_value) {
+ const bool changed = current_value != new_value;
+ current_value = new_value;
+ return changed;
+}
+
+void Enable(GLenum cap, bool enable) {
+ if (enable) {
+ glEnable(cap);
+ } else {
+ glDisable(cap);
+ }
+}
+
+void Enable(GLenum cap, GLuint index, bool enable) {
+ if (enable) {
+ glEnablei(cap, index);
+ } else {
+ glDisablei(cap, index);
+ }
+}
+
+void Enable(GLenum cap, bool& current_value, bool new_value) {
+ if (UpdateValue(current_value, new_value))
+ Enable(cap, new_value);
+}
+
+void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) {
+ if (UpdateValue(current_value, new_value))
+ Enable(cap, index, new_value);
+}
+
+} // namespace
+
OpenGLState::OpenGLState() {
// These all match default OpenGL values
- geometry_shaders.enabled = false;
framebuffer_srgb.enabled = false;
+
multisample_control.alpha_to_coverage = false;
multisample_control.alpha_to_one = false;
+
cull.enabled = false;
cull.mode = GL_BACK;
cull.front_face = GL_CCW;
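
The helpers introduced in the hunk above centralize redundant-state elision: compare against the shadowed value, overwrite it, and only issue the GL call when something actually changed. A runnable sketch of the pattern (the printed strings stand in for the real GL calls):

```cpp
#include <iostream>

template <typename T>
bool UpdateValue(T& current_value, const T new_value) {
    const bool changed = current_value != new_value;
    current_value = new_value;
    return changed;
}

int main() {
    bool depth_test = false; // stands in for cur_state.depth.test_enabled
    if (UpdateValue(depth_test, true)) {
        std::cout << "glEnable(GL_DEPTH_TEST)\n"; // issued exactly once
    }
    if (UpdateValue(depth_test, true)) {
        std::cout << "never printed\n"; // redundant state change skipped
    }
}
```
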
@@ -30,14 +76,15 @@ OpenGLState::OpenGLState() {
primitive_restart.enabled = false;
primitive_restart.index = 0;
+
for (auto& item : color_mask) {
item.red_enabled = GL_TRUE;
item.green_enabled = GL_TRUE;
item.blue_enabled = GL_TRUE;
item.alpha_enabled = GL_TRUE;
}
- stencil.test_enabled = false;
- auto reset_stencil = [](auto& config) {
+
+ const auto ResetStencil = [](auto& config) {
config.test_func = GL_ALWAYS;
config.test_ref = 0;
config.test_mask = 0xFFFFFFFF;
@@ -46,8 +93,10 @@ OpenGLState::OpenGLState() {
config.action_depth_pass = GL_KEEP;
config.action_stencil_fail = GL_KEEP;
};
- reset_stencil(stencil.front);
- reset_stencil(stencil.back);
+ stencil.test_enabled = false;
+ ResetStencil(stencil.front);
+ ResetStencil(stencil.back);
+
for (auto& item : viewports) {
item.x = 0;
item.y = 0;
@@ -61,6 +110,7 @@ OpenGLState::OpenGLState() {
item.scissor.width = 0;
item.scissor.height = 0;
}
+
for (auto& item : blend) {
item.enabled = true;
item.rgb_equation = GL_FUNC_ADD;
@@ -70,11 +120,14 @@ OpenGLState::OpenGLState() {
item.src_a_func = GL_ONE;
item.dst_a_func = GL_ZERO;
}
+
independant_blend.enabled = false;
+
blend_color.red = 0.0f;
blend_color.green = 0.0f;
blend_color.blue = 0.0f;
blend_color.alpha = 0.0f;
+
logic_op.enabled = false;
logic_op.operation = GL_COPY;
@@ -91,9 +144,12 @@ OpenGLState::OpenGLState() {
clip_distance = {};
point.size = 1;
+
fragment_color_clamp.enabled = false;
+
depth_clamp.far_plane = false;
depth_clamp.near_plane = false;
+
polygon_offset.fill_enable = false;
polygon_offset.line_enable = false;
polygon_offset.point_enable = false;
@@ -103,260 +159,255 @@ OpenGLState::OpenGLState() {
}
void OpenGLState::ApplyDefaultState() {
+ glEnable(GL_BLEND);
glDisable(GL_FRAMEBUFFER_SRGB);
glDisable(GL_CULL_FACE);
glDisable(GL_DEPTH_TEST);
glDisable(GL_PRIMITIVE_RESTART);
glDisable(GL_STENCIL_TEST);
- glEnable(GL_BLEND);
glDisable(GL_COLOR_LOGIC_OP);
glDisable(GL_SCISSOR_TEST);
}
+void OpenGLState::ApplyFramebufferState() const {
+ if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) {
+ glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
+ }
+ if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) {
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
+ }
+}
+
+void OpenGLState::ApplyVertexArrayState() const {
+ if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) {
+ glBindVertexArray(draw.vertex_array);
+ }
+}
+
+void OpenGLState::ApplyShaderProgram() const {
+ if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) {
+ glUseProgram(draw.shader_program);
+ }
+}
+
+void OpenGLState::ApplyProgramPipeline() const {
+ if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) {
+ glBindProgramPipeline(draw.program_pipeline);
+ }
+}
+
+void OpenGLState::ApplyClipDistances() const {
+ for (std::size_t i = 0; i < clip_distance.size(); ++i) {
+ Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i],
+ clip_distance[i]);
+ }
+}
+
+void OpenGLState::ApplyPointSize() const {
+ if (UpdateValue(cur_state.point.size, point.size)) {
+ glPointSize(point.size);
+ }
+}
+
+void OpenGLState::ApplyFragmentColorClamp() const {
+ if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) {
+ glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
+ fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
+ }
+}
+
+void OpenGLState::ApplyMultisample() const {
+ Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage,
+ multisample_control.alpha_to_coverage);
+ Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one,
+ multisample_control.alpha_to_one);
+}
+
+void OpenGLState::ApplyDepthClamp() const {
+ if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
+ depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
+ return;
+ }
+ cur_state.depth_clamp = depth_clamp;
+
+ UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
+ "Unimplemented Depth Clamp Separation!");
+
+ Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane);
+}
+
void OpenGLState::ApplySRgb() const {
- if (framebuffer_srgb.enabled != cur_state.framebuffer_srgb.enabled) {
- if (framebuffer_srgb.enabled) {
- // Track if sRGB is used
- s_rgb_used = true;
- glEnable(GL_FRAMEBUFFER_SRGB);
- } else {
- glDisable(GL_FRAMEBUFFER_SRGB);
- }
+ if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled)
+ return;
+ cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
+ if (framebuffer_srgb.enabled) {
+ // Track if sRGB is used
+ s_rgb_used = true;
+ glEnable(GL_FRAMEBUFFER_SRGB);
+ } else {
+ glDisable(GL_FRAMEBUFFER_SRGB);
}
}
void OpenGLState::ApplyCulling() const {
- if (cull.enabled != cur_state.cull.enabled) {
- if (cull.enabled) {
- glEnable(GL_CULL_FACE);
- } else {
- glDisable(GL_CULL_FACE);
- }
- }
+ Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled);
- if (cull.mode != cur_state.cull.mode) {
+ if (UpdateValue(cur_state.cull.mode, cull.mode)) {
glCullFace(cull.mode);
}
- if (cull.front_face != cur_state.cull.front_face) {
+ if (UpdateValue(cur_state.cull.front_face, cull.front_face)) {
glFrontFace(cull.front_face);
}
}
void OpenGLState::ApplyColorMask() const {
- if (independant_blend.enabled) {
- for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
- const auto& updated = color_mask[i];
- const auto& current = cur_state.color_mask[i];
- if (updated.red_enabled != current.red_enabled ||
- updated.green_enabled != current.green_enabled ||
- updated.blue_enabled != current.blue_enabled ||
- updated.alpha_enabled != current.alpha_enabled) {
- glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
- updated.blue_enabled, updated.alpha_enabled);
- }
- }
- } else {
- const auto& updated = color_mask[0];
- const auto& current = cur_state.color_mask[0];
+ for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
+ const auto& updated = color_mask[i];
+ auto& current = cur_state.color_mask[i];
if (updated.red_enabled != current.red_enabled ||
updated.green_enabled != current.green_enabled ||
updated.blue_enabled != current.blue_enabled ||
updated.alpha_enabled != current.alpha_enabled) {
- glColorMask(updated.red_enabled, updated.green_enabled, updated.blue_enabled,
- updated.alpha_enabled);
+ current = updated;
+ glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
+ updated.blue_enabled, updated.alpha_enabled);
}
}
}
void OpenGLState::ApplyDepth() const {
- if (depth.test_enabled != cur_state.depth.test_enabled) {
- if (depth.test_enabled) {
- glEnable(GL_DEPTH_TEST);
- } else {
- glDisable(GL_DEPTH_TEST);
- }
- }
+ Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled);
- if (depth.test_func != cur_state.depth.test_func) {
+ if (cur_state.depth.test_func != depth.test_func) {
+ cur_state.depth.test_func = depth.test_func;
glDepthFunc(depth.test_func);
}
- if (depth.write_mask != cur_state.depth.write_mask) {
+ if (cur_state.depth.write_mask != depth.write_mask) {
+ cur_state.depth.write_mask = depth.write_mask;
glDepthMask(depth.write_mask);
}
}
void OpenGLState::ApplyPrimitiveRestart() const {
- if (primitive_restart.enabled != cur_state.primitive_restart.enabled) {
- if (primitive_restart.enabled) {
- glEnable(GL_PRIMITIVE_RESTART);
- } else {
- glDisable(GL_PRIMITIVE_RESTART);
- }
- }
+ Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled);
- if (primitive_restart.index != cur_state.primitive_restart.index) {
+ if (cur_state.primitive_restart.index != primitive_restart.index) {
+ cur_state.primitive_restart.index = primitive_restart.index;
glPrimitiveRestartIndex(primitive_restart.index);
}
}
void OpenGLState::ApplyStencilTest() const {
- if (stencil.test_enabled != cur_state.stencil.test_enabled) {
- if (stencil.test_enabled) {
- glEnable(GL_STENCIL_TEST);
- } else {
- glDisable(GL_STENCIL_TEST);
- }
- }
-
- const auto ConfigStencil = [](GLenum face, const auto& config, const auto& prev_config) {
- if (config.test_func != prev_config.test_func || config.test_ref != prev_config.test_ref ||
- config.test_mask != prev_config.test_mask) {
+ Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled);
+
+ const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) {
+ if (current.test_func != config.test_func || current.test_ref != config.test_ref ||
+ current.test_mask != config.test_mask) {
+ current.test_func = config.test_func;
+ current.test_ref = config.test_ref;
+ current.test_mask = config.test_mask;
glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask);
}
- if (config.action_depth_fail != prev_config.action_depth_fail ||
- config.action_depth_pass != prev_config.action_depth_pass ||
- config.action_stencil_fail != prev_config.action_stencil_fail) {
+ if (current.action_depth_fail != config.action_depth_fail ||
+ current.action_depth_pass != config.action_depth_pass ||
+ current.action_stencil_fail != config.action_stencil_fail) {
+ current.action_depth_fail = config.action_depth_fail;
+ current.action_depth_pass = config.action_depth_pass;
+ current.action_stencil_fail = config.action_stencil_fail;
glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail,
config.action_depth_pass);
}
- if (config.write_mask != prev_config.write_mask) {
+ if (current.write_mask != config.write_mask) {
+ current.write_mask = config.write_mask;
glStencilMaskSeparate(face, config.write_mask);
}
};
ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front);
ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back);
}
-// Viewport does not affects glClearBuffer so emulate viewport using scissor test
-void OpenGLState::EmulateViewportWithScissor() {
- auto& current = viewports[0];
- if (current.scissor.enabled) {
- const GLint left = std::max(current.x, current.scissor.x);
- const GLint right =
- std::max(current.x + current.width, current.scissor.x + current.scissor.width);
- const GLint bottom = std::max(current.y, current.scissor.y);
- const GLint top =
- std::max(current.y + current.height, current.scissor.y + current.scissor.height);
- current.scissor.x = std::max(left, 0);
- current.scissor.y = std::max(bottom, 0);
- current.scissor.width = std::max(right - left, 0);
- current.scissor.height = std::max(top - bottom, 0);
- } else {
- current.scissor.enabled = true;
- current.scissor.x = current.x;
- current.scissor.y = current.y;
- current.scissor.width = current.width;
- current.scissor.height = current.height;
- }
-}
void OpenGLState::ApplyViewport() const {
- if (geometry_shaders.enabled) {
- for (GLuint i = 0; i < static_cast<GLuint>(Tegra::Engines::Maxwell3D::Regs::NumViewports);
- i++) {
- const auto& current = cur_state.viewports[i];
- const auto& updated = viewports[i];
- if (updated.x != current.x || updated.y != current.y ||
- updated.width != current.width || updated.height != current.height) {
- glViewportIndexedf(
- i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
- static_cast<GLfloat>(updated.width), static_cast<GLfloat>(updated.height));
- }
- if (updated.depth_range_near != current.depth_range_near ||
- updated.depth_range_far != current.depth_range_far) {
- glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
- }
-
- if (updated.scissor.enabled != current.scissor.enabled) {
- if (updated.scissor.enabled) {
- glEnablei(GL_SCISSOR_TEST, i);
- } else {
- glDisablei(GL_SCISSOR_TEST, i);
- }
- }
-
- if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
- updated.scissor.width != current.scissor.width ||
- updated.scissor.height != current.scissor.height) {
- glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
- updated.scissor.height);
- }
- }
- } else {
- const auto& current = cur_state.viewports[0];
- const auto& updated = viewports[0];
- if (updated.x != current.x || updated.y != current.y || updated.width != current.width ||
- updated.height != current.height) {
- glViewport(updated.x, updated.y, updated.width, updated.height);
- }
-
- if (updated.depth_range_near != current.depth_range_near ||
- updated.depth_range_far != current.depth_range_far) {
- glDepthRange(updated.depth_range_near, updated.depth_range_far);
+ for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) {
+ const auto& updated = viewports[i];
+ auto& current = cur_state.viewports[i];
+
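+ // Indexed viewport, depth-range and scissor state are now applied for every Maxwell
+ // viewport, since viewport arrays no longer depend on geometry shaders being enabled.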
+ if (current.x != updated.x || current.y != updated.y || current.width != updated.width ||
+ current.height != updated.height) {
+ current.x = updated.x;
+ current.y = updated.y;
+ current.width = updated.width;
+ current.height = updated.height;
+ glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
+ static_cast<GLfloat>(updated.width),
+ static_cast<GLfloat>(updated.height));
}
-
- if (updated.scissor.enabled != current.scissor.enabled) {
- if (updated.scissor.enabled) {
- glEnable(GL_SCISSOR_TEST);
- } else {
- glDisable(GL_SCISSOR_TEST);
- }
+ if (current.depth_range_near != updated.depth_range_near ||
+ current.depth_range_far != updated.depth_range_far) {
+ current.depth_range_near = updated.depth_range_near;
+ current.depth_range_far = updated.depth_range_far;
+ glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
}
- if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
- updated.scissor.width != current.scissor.width ||
- updated.scissor.height != current.scissor.height) {
- glScissor(updated.scissor.x, updated.scissor.y, updated.scissor.width,
- updated.scissor.height);
+ Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled);
+
+ if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y ||
+ current.scissor.width != updated.scissor.width ||
+ current.scissor.height != updated.scissor.height) {
+ current.scissor.x = updated.scissor.x;
+ current.scissor.y = updated.scissor.y;
+ current.scissor.width = updated.scissor.width;
+ current.scissor.height = updated.scissor.height;
+ glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
+ updated.scissor.height);
}
}
}
void OpenGLState::ApplyGlobalBlending() const {
- const Blend& current = cur_state.blend[0];
const Blend& updated = blend[0];
- if (updated.enabled != current.enabled) {
- if (updated.enabled) {
- glEnable(GL_BLEND);
- } else {
- glDisable(GL_BLEND);
- }
- }
- if (!updated.enabled) {
- return;
- }
- if (updated.src_rgb_func != current.src_rgb_func ||
- updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
- updated.dst_a_func != current.dst_a_func) {
+ Blend& current = cur_state.blend[0];
+
+ Enable(GL_BLEND, current.enabled, updated.enabled);
+
+ if (current.src_rgb_func != updated.src_rgb_func ||
+ current.dst_rgb_func != updated.dst_rgb_func || current.src_a_func != updated.src_a_func ||
+ current.dst_a_func != updated.dst_a_func) {
+ current.src_rgb_func = updated.src_rgb_func;
+ current.dst_rgb_func = updated.dst_rgb_func;
+ current.src_a_func = updated.src_a_func;
+ current.dst_a_func = updated.dst_a_func;
glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
updated.dst_a_func);
}
- if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
+ if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) {
+ current.rgb_equation = updated.rgb_equation;
+ current.a_equation = updated.a_equation;
glBlendEquationSeparate(updated.rgb_equation, updated.a_equation);
}
}
void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
const Blend& updated = blend[target];
- const Blend& current = cur_state.blend[target];
- if (updated.enabled != current.enabled || force) {
- if (updated.enabled) {
- glEnablei(GL_BLEND, static_cast<GLuint>(target));
- } else {
- glDisablei(GL_BLEND, static_cast<GLuint>(target));
- }
+ Blend& current = cur_state.blend[target];
+
+ if (current.enabled != updated.enabled || force) {
+ current.enabled = updated.enabled;
+ Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled);
}
- if (updated.src_rgb_func != current.src_rgb_func ||
- updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
- updated.dst_a_func != current.dst_a_func) {
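+ // UpdateTie is assumed to compare the tied fields, copy any changed values into the cached
+ // state, and return true when an update happened.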
+ if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func,
+ current.dst_a_func),
+ std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
+ updated.dst_a_func))) {
glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func,
updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func);
}
- if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
+ if (UpdateTie(std::tie(current.rgb_equation, current.a_equation),
+ std::tie(updated.rgb_equation, updated.a_equation))) {
glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation,
updated.a_equation);
}
@@ -364,77 +415,48 @@ void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
void OpenGLState::ApplyBlending() const {
if (independant_blend.enabled) {
- for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
- ApplyTargetBlending(i,
- independant_blend.enabled != cur_state.independant_blend.enabled);
+ const bool force = independant_blend.enabled != cur_state.independant_blend.enabled;
+ for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) {
+ ApplyTargetBlending(target, force);
}
} else {
ApplyGlobalBlending();
}
- if (blend_color.red != cur_state.blend_color.red ||
- blend_color.green != cur_state.blend_color.green ||
- blend_color.blue != cur_state.blend_color.blue ||
- blend_color.alpha != cur_state.blend_color.alpha) {
+ cur_state.independant_blend.enabled = independant_blend.enabled;
+
+ if (UpdateTie(
+ std::tie(cur_state.blend_color.red, cur_state.blend_color.green,
+ cur_state.blend_color.blue, cur_state.blend_color.alpha),
+ std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) {
glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha);
}
}
void OpenGLState::ApplyLogicOp() const {
- if (logic_op.enabled != cur_state.logic_op.enabled) {
- if (logic_op.enabled) {
- glEnable(GL_COLOR_LOGIC_OP);
- } else {
- glDisable(GL_COLOR_LOGIC_OP);
- }
- }
+ Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled);
- if (logic_op.operation != cur_state.logic_op.operation) {
+ if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) {
glLogicOp(logic_op.operation);
}
}
void OpenGLState::ApplyPolygonOffset() const {
- const bool fill_enable_changed =
- polygon_offset.fill_enable != cur_state.polygon_offset.fill_enable;
- const bool line_enable_changed =
- polygon_offset.line_enable != cur_state.polygon_offset.line_enable;
- const bool point_enable_changed =
- polygon_offset.point_enable != cur_state.polygon_offset.point_enable;
- const bool factor_changed = polygon_offset.factor != cur_state.polygon_offset.factor;
- const bool units_changed = polygon_offset.units != cur_state.polygon_offset.units;
- const bool clamp_changed = polygon_offset.clamp != cur_state.polygon_offset.clamp;
-
- if (fill_enable_changed) {
- if (polygon_offset.fill_enable) {
- glEnable(GL_POLYGON_OFFSET_FILL);
- } else {
- glDisable(GL_POLYGON_OFFSET_FILL);
- }
- }
-
- if (line_enable_changed) {
- if (polygon_offset.line_enable) {
- glEnable(GL_POLYGON_OFFSET_LINE);
- } else {
- glDisable(GL_POLYGON_OFFSET_LINE);
- }
- }
-
- if (point_enable_changed) {
- if (polygon_offset.point_enable) {
- glEnable(GL_POLYGON_OFFSET_POINT);
- } else {
- glDisable(GL_POLYGON_OFFSET_POINT);
- }
- }
-
- if (factor_changed || units_changed || clamp_changed) {
+ Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable,
+ polygon_offset.fill_enable);
+ Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable,
+ polygon_offset.line_enable);
+ Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable,
+ polygon_offset.point_enable);
+
+ if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units,
+ cur_state.polygon_offset.clamp),
+ std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) {
if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) {
glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp);
} else {
- glPolygonOffset(polygon_offset.factor, polygon_offset.units);
UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0,
"Unimplemented Depth polygon offset clamp.");
+ glPolygonOffset(polygon_offset.factor, polygon_offset.units);
}
}
}
@@ -443,22 +465,21 @@ void OpenGLState::ApplyTextures() const {
bool has_delta{};
std::size_t first{};
std::size_t last{};
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures;
+ std::array<GLuint, Maxwell::NumTextureSamplers> textures;
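+ // Track the first and last unit whose binding changed so a single glBindTextures call can
+ // rebind the whole dirty range.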
for (std::size_t i = 0; i < std::size(texture_units); ++i) {
const auto& texture_unit = texture_units[i];
- const auto& cur_state_texture_unit = cur_state.texture_units[i];
+ auto& cur_state_texture_unit = cur_state.texture_units[i];
textures[i] = texture_unit.texture;
-
- if (textures[i] != cur_state_texture_unit.texture) {
- if (!has_delta) {
- first = i;
- has_delta = true;
- }
- last = i;
+ if (cur_state_texture_unit.texture == textures[i])
+ continue;
+ cur_state_texture_unit.texture = textures[i];
+ if (!has_delta) {
+ first = i;
+ has_delta = true;
}
+ last = i;
}
-
if (has_delta) {
glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
textures.data() + first);
@@ -469,16 +490,18 @@ void OpenGLState::ApplySamplers() const {
bool has_delta{};
std::size_t first{};
std::size_t last{};
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers;
+ std::array<GLuint, Maxwell::NumTextureSamplers> samplers;
+
for (std::size_t i = 0; i < std::size(samplers); ++i) {
+ if (cur_state.texture_units[i].sampler == texture_units[i].sampler)
+ continue;
+ cur_state.texture_units[i].sampler = texture_units[i].sampler;
samplers[i] = texture_units[i].sampler;
- if (samplers[i] != cur_state.texture_units[i].sampler) {
- if (!has_delta) {
- first = i;
- has_delta = true;
- }
- last = i;
+ if (!has_delta) {
+ first = i;
+ has_delta = true;
}
+ last = i;
}
if (has_delta) {
glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
@@ -486,81 +509,15 @@ void OpenGLState::ApplySamplers() const {
}
}
-void OpenGLState::ApplyFramebufferState() const {
- if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
- glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
- }
- if (draw.draw_framebuffer != cur_state.draw.draw_framebuffer) {
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
- }
-}
-
-void OpenGLState::ApplyVertexArrayState() const {
- if (draw.vertex_array != cur_state.draw.vertex_array) {
- glBindVertexArray(draw.vertex_array);
- }
-}
-
-void OpenGLState::ApplyDepthClamp() const {
- if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
- depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
- return;
- }
- UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
- "Unimplemented Depth Clamp Separation!");
-
- if (depth_clamp.far_plane || depth_clamp.near_plane) {
- glEnable(GL_DEPTH_CLAMP);
- } else {
- glDisable(GL_DEPTH_CLAMP);
- }
-}
-
void OpenGLState::Apply() const {
ApplyFramebufferState();
ApplyVertexArrayState();
-
- // Shader program
- if (draw.shader_program != cur_state.draw.shader_program) {
- glUseProgram(draw.shader_program);
- }
-
- // Program pipeline
- if (draw.program_pipeline != cur_state.draw.program_pipeline) {
- glBindProgramPipeline(draw.program_pipeline);
- }
- // Clip distance
- for (std::size_t i = 0; i < clip_distance.size(); ++i) {
- if (clip_distance[i] != cur_state.clip_distance[i]) {
- if (clip_distance[i]) {
- glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
- } else {
- glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
- }
- }
- }
- // Point
- if (point.size != cur_state.point.size) {
- glPointSize(point.size);
- }
- if (fragment_color_clamp.enabled != cur_state.fragment_color_clamp.enabled) {
- glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
- fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
- }
- if (multisample_control.alpha_to_coverage != cur_state.multisample_control.alpha_to_coverage) {
- if (multisample_control.alpha_to_coverage) {
- glEnable(GL_SAMPLE_ALPHA_TO_COVERAGE);
- } else {
- glDisable(GL_SAMPLE_ALPHA_TO_COVERAGE);
- }
- }
- if (multisample_control.alpha_to_one != cur_state.multisample_control.alpha_to_one) {
- if (multisample_control.alpha_to_one) {
- glEnable(GL_SAMPLE_ALPHA_TO_ONE);
- } else {
- glDisable(GL_SAMPLE_ALPHA_TO_ONE);
- }
- }
+ ApplyShaderProgram();
+ ApplyProgramPipeline();
+ ApplyClipDistances();
+ ApplyPointSize();
+ ApplyFragmentColorClamp();
+ ApplyMultisample();
ApplyDepthClamp();
ApplyColorMask();
ApplyViewport();
@@ -574,7 +531,28 @@ void OpenGLState::Apply() const {
ApplyTextures();
ApplySamplers();
ApplyPolygonOffset();
- cur_state = *this;
+}
+
+void OpenGLState::EmulateViewportWithScissor() {
+ auto& current = viewports[0];
+ if (current.scissor.enabled) {
+ const GLint left = std::max(current.x, current.scissor.x);
+ const GLint right =
+ std::max(current.x + current.width, current.scissor.x + current.scissor.width);
+ const GLint bottom = std::max(current.y, current.scissor.y);
+ const GLint top =
+ std::max(current.y + current.height, current.scissor.y + current.scissor.height);
+ current.scissor.x = std::max(left, 0);
+ current.scissor.y = std::max(bottom, 0);
+ current.scissor.width = std::max(right - left, 0);
+ current.scissor.height = std::max(top - bottom, 0);
+ } else {
+ current.scissor.enabled = true;
+ current.scissor.x = current.x;
+ current.scissor.y = current.y;
+ current.scissor.width = current.width;
+ current.scissor.height = current.height;
+ }
}
OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 9e1eda5b1..41418a7b8 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -54,10 +54,6 @@ public:
} depth_clamp; // GL_DEPTH_CLAMP
struct {
- bool enabled; // viewports arrays are only supported when geometry shaders are enabled.
- } geometry_shaders;
-
- struct {
bool enabled; // GL_CULL_FACE
GLenum mode; // GL_CULL_FACE_MODE
GLenum front_face; // GL_FRONT_FACE
@@ -184,34 +180,26 @@ public:
static OpenGLState GetCurState() {
return cur_state;
}
+
static bool GetsRGBUsed() {
return s_rgb_used;
}
+
static void ClearsRGBUsed() {
s_rgb_used = false;
}
+
/// Apply this state as the current OpenGL state
void Apply() const;
- /// Apply only the state affecting the framebuffer
+
void ApplyFramebufferState() const;
- /// Apply only the state affecting the vertex array
void ApplyVertexArrayState() const;
- /// Set the initial OpenGL state
- static void ApplyDefaultState();
- /// Resets any references to the given resource
- OpenGLState& UnbindTexture(GLuint handle);
- OpenGLState& ResetSampler(GLuint handle);
- OpenGLState& ResetProgram(GLuint handle);
- OpenGLState& ResetPipeline(GLuint handle);
- OpenGLState& ResetVertexArray(GLuint handle);
- OpenGLState& ResetFramebuffer(GLuint handle);
- void EmulateViewportWithScissor();
-
-private:
- static OpenGLState cur_state;
- // Workaround for sRGB problems caused by
- // QT not supporting srgb output
- static bool s_rgb_used;
+ void ApplyShaderProgram() const;
+ void ApplyProgramPipeline() const;
+ void ApplyClipDistances() const;
+ void ApplyPointSize() const;
+ void ApplyFragmentColorClamp() const;
+ void ApplyMultisample() const;
void ApplySRgb() const;
void ApplyCulling() const;
void ApplyColorMask() const;
@@ -227,6 +215,26 @@ private:
void ApplySamplers() const;
void ApplyDepthClamp() const;
void ApplyPolygonOffset() const;
+
+ /// Set the initial OpenGL state
+ static void ApplyDefaultState();
+
+ /// Resets any references to the given resource
+ OpenGLState& UnbindTexture(GLuint handle);
+ OpenGLState& ResetSampler(GLuint handle);
+ OpenGLState& ResetProgram(GLuint handle);
+ OpenGLState& ResetPipeline(GLuint handle);
+ OpenGLState& ResetVertexArray(GLuint handle);
+ OpenGLState& ResetFramebuffer(GLuint handle);
+
+ /// Viewport does not affect glClearBuffer, so emulate the viewport using the scissor test
+ void EmulateViewportWithScissor();
+
+private:
+ static OpenGLState cur_state;
+
+ // Workaround for sRGB problems caused by Qt not supporting sRGB output
+ static bool s_rgb_used;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b97576309..d69cba9c3 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,7 +5,6 @@
#include <algorithm>
#include <cstddef>
#include <cstdlib>
-#include <cstring>
#include <memory>
#include <glad/glad.h>
#include "common/assert.h"
@@ -164,8 +163,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
// Reset the screen info's display texture to its own permanent texture
screen_info.display_texture = screen_info.texture.resource.handle;
- Memory::RasterizerFlushVirtualRegion(framebuffer_addr, size_in_bytes,
- Memory::FlushMode::Flush);
+ rasterizer->FlushRegion(ToCacheAddr(Memory::GetPointer(framebuffer_addr)), size_in_bytes);
constexpr u32 linear_bpp = 4;
VideoCore::MortonCopyPixels128(VideoCore::MortonSwizzleMode::MortonToLinear,
@@ -267,7 +265,7 @@ void RendererOpenGL::CreateRasterizer() {
}
// Initialize sRGB Usage
OpenGLState::ClearsRGBUsed();
- rasterizer = std::make_unique<RasterizerOpenGL>(render_window, system, screen_info);
+ rasterizer = std::make_unique<RasterizerOpenGL>(system, screen_info);
}
void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 4a33a6c84..02a9f5ecb 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -10,6 +10,7 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "core/memory.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_vulkan/declarations.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -17,6 +18,11 @@
namespace Vulkan {
+CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset,
+ std::size_t alignment, u8* host_ptr)
+ : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
+ alignment{alignment} {}
+
VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager,
VideoCore::RasterizerInterface& rasterizer, const VKDevice& device,
VKMemoryManager& memory_manager, VKScheduler& scheduler, u64 size)
@@ -34,19 +40,20 @@ VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager,
VKBufferCache::~VKBufferCache() = default;
-u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment,
- bool cache) {
+u64 VKBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, u64 alignment, bool cache) {
const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)};
- ASSERT(cpu_addr);
+ ASSERT_MSG(cpu_addr, "Invalid GPU address");
// Cache management is a big overhead, so only cache entries with a given size.
// TODO: Figure out which size is the best for given games.
cache &= size >= 2048;
+ const auto& host_ptr{Memory::GetPointer(*cpu_addr)};
if (cache) {
- if (auto entry = TryGet(*cpu_addr); entry) {
- if (entry->size >= size && entry->alignment == alignment) {
- return entry->offset;
+ auto entry = TryGet(host_ptr);
+ if (entry) {
+ if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
+ return entry->GetOffset();
}
Unregister(entry);
}
@@ -55,17 +62,17 @@ u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64
AlignBuffer(alignment);
const u64 uploaded_offset = buffer_offset;
- Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+ if (!host_ptr) {
+ return uploaded_offset;
+ }
+ std::memcpy(buffer_ptr, host_ptr, size);
buffer_ptr += size;
buffer_offset += size;
if (cache) {
- auto entry = std::make_shared<CachedBufferEntry>();
- entry->offset = uploaded_offset;
- entry->size = size;
- entry->alignment = alignment;
- entry->addr = *cpu_addr;
+ auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset,
+ alignment, host_ptr);
Register(entry);
}
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index d8e916f31..08b786aad 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -24,22 +24,39 @@ class VKFence;
class VKMemoryManager;
class VKStreamBuffer;
-struct CachedBufferEntry final : public RasterizerCacheObject {
- VAddr GetAddr() const override {
- return addr;
+class CachedBufferEntry final : public RasterizerCacheObject {
+public:
+ explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, std::size_t alignment,
+ u8* host_ptr);
+
+ VAddr GetCpuAddr() const override {
+ return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
return size;
}
+ std::size_t GetSize() const {
+ return size;
+ }
+
+ u64 GetOffset() const {
+ return offset;
+ }
+
+ std::size_t GetAlignment() const {
+ return alignment;
+ }
+
// We do not have to flush this cache as things in it are never modified by us.
void Flush() override {}
- VAddr addr;
- std::size_t size;
- u64 offset;
- std::size_t alignment;
+private:
+ VAddr cpu_addr{};
+ std::size_t size{};
+ u64 offset{};
+ std::size_t alignment{};
};
class VKBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
@@ -51,8 +68,7 @@ public:
/// Uploads data from a guest GPU address. Returns host's buffer offset where it's been
/// allocated.
- u64 UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment = 4,
- bool cache = true);
+ u64 UploadMemory(GPUVAddr gpu_addr, std::size_t size, u64 alignment = 4, bool cache = true);
/// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
u64 UploadHostMemory(const u8* raw_pointer, std::size_t size, u64 alignment = 4);
diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp
index a1e117443..13c46e5b8 100644
--- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_resource_manager.cpp
@@ -21,7 +21,7 @@ public:
CommandBufferPool(const VKDevice& device)
: VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {}
- void Allocate(std::size_t begin, std::size_t end) {
+ void Allocate(std::size_t begin, std::size_t end) override {
const auto dev = device.GetLogical();
const auto& dld = device.GetDispatchLoader();
const u32 graphics_family = device.GetGraphicsFamily();
diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h
index 5bfe4cead..08ee86fa6 100644
--- a/src/video_core/renderer_vulkan/vk_resource_manager.h
+++ b/src/video_core/renderer_vulkan/vk_resource_manager.h
@@ -97,7 +97,7 @@ private:
class VKFenceWatch final : public VKResource {
public:
explicit VKFenceWatch();
- ~VKFenceWatch();
+ ~VKFenceWatch() override;
/// Waits for the fence to be released.
void Wait();
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
new file mode 100644
index 000000000..08279e562
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -0,0 +1,210 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/frontend/framebuffer_layout.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_swapchain.h"
+
+namespace Vulkan {
+
+namespace {
+vk::SurfaceFormatKHR ChooseSwapSurfaceFormat(const std::vector<vk::SurfaceFormatKHR>& formats) {
+ if (formats.size() == 1 && formats[0].format == vk::Format::eUndefined) {
+ return {vk::Format::eB8G8R8A8Unorm, vk::ColorSpaceKHR::eSrgbNonlinear};
+ }
+ const auto& found = std::find_if(formats.begin(), formats.end(), [](const auto& format) {
+ return format.format == vk::Format::eB8G8R8A8Unorm &&
+ format.colorSpace == vk::ColorSpaceKHR::eSrgbNonlinear;
+ });
+ return found != formats.end() ? *found : formats[0];
+}
+
+vk::PresentModeKHR ChooseSwapPresentMode(const std::vector<vk::PresentModeKHR>& modes) {
+ // Mailbox doesn't block the application like FIFO (vsync) does, so prefer it
+ const auto& found = std::find_if(modes.begin(), modes.end(), [](const auto& mode) {
+ return mode == vk::PresentModeKHR::eMailbox;
+ });
+ return found != modes.end() ? *found : vk::PresentModeKHR::eFifo;
+}
+
+vk::Extent2D ChooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width,
+ u32 height) {
+ constexpr auto undefined_size{std::numeric_limits<u32>::max()};
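+ // A currentExtent of 0xFFFFFFFF means the surface lets the swapchain pick its own size;
+ // any other value must be used as-is.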
+ if (capabilities.currentExtent.width != undefined_size) {
+ return capabilities.currentExtent;
+ }
+ vk::Extent2D extent = {width, height};
+ extent.width = std::max(capabilities.minImageExtent.width,
+ std::min(capabilities.maxImageExtent.width, extent.width));
+ extent.height = std::max(capabilities.minImageExtent.height,
+ std::min(capabilities.maxImageExtent.height, extent.height));
+ return extent;
+}
+} // namespace
+
+VKSwapchain::VKSwapchain(vk::SurfaceKHR surface, const VKDevice& device)
+ : surface{surface}, device{device} {}
+
+VKSwapchain::~VKSwapchain() = default;
+
+void VKSwapchain::Create(u32 width, u32 height) {
+ const auto dev = device.GetLogical();
+ const auto& dld = device.GetDispatchLoader();
+ const auto physical_device = device.GetPhysical();
+
+ const vk::SurfaceCapabilitiesKHR capabilities{
+ physical_device.getSurfaceCapabilitiesKHR(surface, dld)};
+ if (capabilities.maxImageExtent.width == 0 || capabilities.maxImageExtent.height == 0) {
+ return;
+ }
+
+ dev.waitIdle(dld);
+ Destroy();
+
+ CreateSwapchain(capabilities, width, height);
+ CreateSemaphores();
+ CreateImageViews();
+
+ fences.resize(image_count, nullptr);
+}
+
+void VKSwapchain::AcquireNextImage() {
+ const auto dev{device.GetLogical()};
+ const auto& dld{device.GetDispatchLoader()};
+ dev.acquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(),
+ *present_semaphores[frame_index], {}, &image_index, dld);
+
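+ // If a previous frame is still using this image, wait on its fence and release it before
+ // reuse.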
+ if (auto& fence = fences[image_index]; fence) {
+ fence->Wait();
+ fence->Release();
+ fence = nullptr;
+ }
+}
+
+bool VKSwapchain::Present(vk::Semaphore render_semaphore, VKFence& fence) {
+ const vk::Semaphore present_semaphore{*present_semaphores[frame_index]};
+ const std::array<vk::Semaphore, 2> semaphores{present_semaphore, render_semaphore};
+ const u32 wait_semaphore_count{render_semaphore ? 2U : 1U};
+ const auto& dld{device.GetDispatchLoader()};
+ const auto present_queue{device.GetPresentQueue()};
+ bool recreated = false;
+
+ const vk::PresentInfoKHR present_info(wait_semaphore_count, semaphores.data(), 1,
+ &swapchain.get(), &image_index, {});
+ switch (const auto result = present_queue.presentKHR(&present_info, dld); result) {
+ case vk::Result::eSuccess:
+ break;
+ case vk::Result::eErrorOutOfDateKHR:
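+ // The surface no longer matches the swapchain (e.g. the window was resized), so recreate
+ // it with the last known dimensions.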
+ if (current_width > 0 && current_height > 0) {
+ Create(current_width, current_height);
+ recreated = true;
+ }
+ break;
+ default:
+ LOG_CRITICAL(Render_Vulkan, "Vulkan failed to present swapchain due to {}!",
+ vk::to_string(result));
+ UNREACHABLE();
+ }
+
+ ASSERT(fences[image_index] == nullptr);
+ fences[image_index] = &fence;
+ frame_index = (frame_index + 1) % image_count;
+ return recreated;
+}
+
+bool VKSwapchain::HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const {
+ // TODO(Rodrigo): Handle framebuffer pixel format changes
+ return framebuffer.width != current_width || framebuffer.height != current_height;
+}
+
+void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width,
+ u32 height) {
+ const auto dev{device.GetLogical()};
+ const auto& dld{device.GetDispatchLoader()};
+ const auto physical_device{device.GetPhysical()};
+
+ const std::vector<vk::SurfaceFormatKHR> formats{
+ physical_device.getSurfaceFormatsKHR(surface, dld)};
+
+ const std::vector<vk::PresentModeKHR> present_modes{
+ physical_device.getSurfacePresentModesKHR(surface, dld)};
+
+ const vk::SurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)};
+ const vk::PresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)};
+ extent = ChooseSwapExtent(capabilities, width, height);
+
+ current_width = extent.width;
+ current_height = extent.height;
+
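+ // Request one image more than the minimum to avoid stalling on the driver; a maxImageCount
+ // of 0 means there is no upper limit.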
+ u32 requested_image_count{capabilities.minImageCount + 1};
+ if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) {
+ requested_image_count = capabilities.maxImageCount;
+ }
+
+ vk::SwapchainCreateInfoKHR swapchain_ci(
+ {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace,
+ extent, 1, vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {},
+ capabilities.currentTransform, vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false,
+ {});
+
+ const u32 graphics_family{device.GetGraphicsFamily()};
+ const u32 present_family{device.GetPresentFamily()};
+ const std::array<u32, 2> queue_indices{graphics_family, present_family};
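+ // Swapchain images must be shared concurrently when the graphics and present queues belong
+ // to different families.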
+ if (graphics_family != present_family) {
+ swapchain_ci.imageSharingMode = vk::SharingMode::eConcurrent;
+ swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size());
+ swapchain_ci.pQueueFamilyIndices = queue_indices.data();
+ } else {
+ swapchain_ci.imageSharingMode = vk::SharingMode::eExclusive;
+ }
+
+ swapchain = dev.createSwapchainKHRUnique(swapchain_ci, nullptr, dld);
+
+ images = dev.getSwapchainImagesKHR(*swapchain, dld);
+ image_count = static_cast<u32>(images.size());
+ image_format = surface_format.format;
+}
+
+void VKSwapchain::CreateSemaphores() {
+ const auto dev{device.GetLogical()};
+ const auto& dld{device.GetDispatchLoader()};
+
+ present_semaphores.resize(image_count);
+ for (std::size_t i = 0; i < image_count; i++) {
+ present_semaphores[i] = dev.createSemaphoreUnique({}, nullptr, dld);
+ }
+}
+
+void VKSwapchain::CreateImageViews() {
+ const auto dev{device.GetLogical()};
+ const auto& dld{device.GetDispatchLoader()};
+
+ image_views.resize(image_count);
+ for (std::size_t i = 0; i < image_count; i++) {
+ const vk::ImageViewCreateInfo image_view_ci({}, images[i], vk::ImageViewType::e2D,
+ image_format, {},
+ {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1});
+ image_views[i] = dev.createImageViewUnique(image_view_ci, nullptr, dld);
+ }
+}
+
+void VKSwapchain::Destroy() {
+ frame_index = 0;
+ present_semaphores.clear();
+ framebuffers.clear();
+ image_views.clear();
+ swapchain.reset();
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h
new file mode 100644
index 000000000..2ad84f185
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -0,0 +1,92 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/renderer_vulkan/declarations.h"
+
+namespace Layout {
+struct FramebufferLayout;
+}
+
+namespace Vulkan {
+
+class VKDevice;
+class VKFence;
+
+class VKSwapchain {
+public:
+ explicit VKSwapchain(vk::SurfaceKHR surface, const VKDevice& device);
+ ~VKSwapchain();
+
+ /// Creates (or recreates) the swapchain with a given size.
+ void Create(u32 width, u32 height);
+
+ /// Acquires the next image in the swapchain, waiting as needed.
+ void AcquireNextImage();
+
+ /// Presents the rendered image to the swapchain. Returns true when the swapchain had to be
+ /// recreated. Takes responsibility for ownership of the fence.
+ bool Present(vk::Semaphore render_semaphore, VKFence& fence);
+
+ /// Returns true when the framebuffer layout has changed.
+ bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const;
+
+ const vk::Extent2D& GetSize() const {
+ return extent;
+ }
+
+ u32 GetImageCount() const {
+ return image_count;
+ }
+
+ u32 GetImageIndex() const {
+ return image_index;
+ }
+
+ vk::Image GetImageIndex(u32 index) const {
+ return images[index];
+ }
+
+ vk::ImageView GetImageViewIndex(u32 index) const {
+ return *image_views[index];
+ }
+
+ vk::Format GetImageFormat() const {
+ return image_format;
+ }
+
+private:
+ void CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width, u32 height);
+ void CreateSemaphores();
+ void CreateImageViews();
+
+ void Destroy();
+
+ const vk::SurfaceKHR surface;
+ const VKDevice& device;
+
+ UniqueSwapchainKHR swapchain;
+
+ u32 image_count{};
+ std::vector<vk::Image> images;
+ std::vector<UniqueImageView> image_views;
+ std::vector<UniqueFramebuffer> framebuffers;
+ std::vector<VKFence*> fences;
+ std::vector<UniqueSemaphore> present_semaphores;
+
+ u32 image_index{};
+ u32 frame_index{};
+
+ vk::Format image_format{};
+ vk::Extent2D extent{};
+
+ u32 current_width{};
+ u32 current_height{};
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index a99ae19bf..a775b402b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -7,7 +7,9 @@
#include <fmt/format.h>
#include "common/assert.h"
+#include "common/bit_field.h"
#include "common/common_types.h"
+#include "common/logging/log.h"
#include "video_core/engines/shader_bytecode.h"
#include "video_core/shader/shader_ir.h"
@@ -41,19 +43,18 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
switch (opcode->get().GetId()) {
case OpCode::Id::TEX: {
- UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI),
- "AOFFI is not implemented");
-
if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) {
LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
}
const TextureType texture_type{instr.tex.texture_type};
const bool is_array = instr.tex.array != 0;
+ const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
const bool depth_compare = instr.tex.UsesMiscMode(TextureMiscMode::DC);
const auto process_mode = instr.tex.GetTextureProcessMode();
WriteTexInstructionFloat(
- bb, instr, GetTexCode(instr, texture_type, process_mode, depth_compare, is_array));
+ bb, instr,
+ GetTexCode(instr, texture_type, process_mode, depth_compare, is_array, is_aoffi));
break;
}
case OpCode::Id::TEXS: {
@@ -78,8 +79,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
}
case OpCode::Id::TLD4: {
ASSERT(instr.tld4.array == 0);
- UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI),
- "AOFFI is not implemented");
UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::NDV),
"NDV is not implemented");
UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP),
@@ -92,8 +91,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
const auto texture_type = instr.tld4.texture_type.Value();
const bool depth_compare = instr.tld4.UsesMiscMode(TextureMiscMode::DC);
const bool is_array = instr.tld4.array != 0;
- WriteTexInstructionFloat(bb, instr,
- GetTld4Code(instr, texture_type, depth_compare, is_array));
+ const bool is_aoffi = instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI);
+ WriteTexInstructionFloat(
+ bb, instr, GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi));
break;
}
case OpCode::Id::TLD4S: {
@@ -127,7 +127,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, {}, {}, {}, {}, component, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, component, element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
@@ -152,7 +152,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
if (!instr.txq.IsComponentEnabled(element)) {
continue;
}
- MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value =
Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8));
SetTemporal(bb, indexer++, value);
@@ -202,7 +202,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
for (u32 element = 0; element < 2; ++element) {
auto params = coords;
- MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
SetTemporal(bb, element, value);
}
@@ -325,7 +325,8 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
TextureProcessMode process_mode, std::vector<Node> coords,
- Node array, Node depth_compare, u32 bias_offset) {
+ Node array, Node depth_compare, u32 bias_offset,
+ std::vector<Node> aoffi) {
const bool is_array = array;
const bool is_shadow = depth_compare;
@@ -374,7 +375,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto copy_coords = coords;
- MetaTexture meta{sampler, array, depth_compare, bias, lod, {}, element};
+ MetaTexture meta{sampler, array, depth_compare, aoffi, bias, lod, {}, element};
values[element] = Operation(read_method, meta, std::move(copy_coords));
}
@@ -382,9 +383,15 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
}
Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,
- TextureProcessMode process_mode, bool depth_compare, bool is_array) {
- const bool lod_bias_enabled =
- (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ);
+ TextureProcessMode process_mode, bool depth_compare, bool is_array,
+ bool is_aoffi) {
+ const bool lod_bias_enabled{
+ (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ)};
+
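+ // Parameters that follow the coordinates are packed starting at gpr20: one slot for
+ // LOD/bias when enabled, then the AOFFI word, then the depth compare value.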
+ u64 parameter_register = instr.gpr20.Value();
+ if (lod_bias_enabled) {
+ ++parameter_register;
+ }
const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement(
texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5);
@@ -404,15 +411,19 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,
const Node array = is_array ? GetRegister(array_register) : nullptr;
+ std::vector<Node> aoffi;
+ if (is_aoffi) {
+ aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false);
+ }
+
Node dc{};
if (depth_compare) {
// Depth is always stored in the register signaled by gpr20 or in the next register if lod
// or bias are used
- const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0);
- dc = GetRegister(depth_register);
+ dc = GetRegister(parameter_register++);
}
- return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0);
+ return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0, aoffi);
}
Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
@@ -448,11 +459,11 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
dc = GetRegister(depth_register);
}
- return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset);
+ return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {});
}
Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare,
- bool is_array) {
+ bool is_array, bool is_aoffi) {
const std::size_t coord_count = GetCoordCount(texture_type);
const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0);
const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0);
@@ -463,15 +474,27 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
const u64 coord_register = array_register + (is_array ? 1 : 0);
std::vector<Node> coords;
- for (size_t i = 0; i < coord_count; ++i)
+ for (std::size_t i = 0; i < coord_count; ++i) {
coords.push_back(GetRegister(coord_register + i));
+ }
+
+ u64 parameter_register = instr.gpr20.Value();
+ std::vector<Node> aoffi;
+ if (is_aoffi) {
+ aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, true);
+ }
+
+ Node dc{};
+ if (depth_compare) {
+ dc = GetRegister(parameter_register++);
+ }
const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, depth_compare);
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, GetRegister(array_register), {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
@@ -507,7 +530,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, array, {}, {}, lod, {}, element};
+ MetaTexture meta{sampler, array, {}, {}, {}, lod, {}, element};
values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy));
}
return values;
@@ -531,4 +554,45 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
return {coord_count, total_coord_count};
}
-} // namespace VideoCommon::Shader
\ No newline at end of file
+std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
+ bool is_tld4) {
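+ // AOFFI packs one signed offset per coordinate into a single register: 4-bit fields for
+ // regular texture instructions and 6-bit fields for TLD4. Values at or above wrap_value
+ // encode the negative half of the two's-complement range.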
+ const auto [coord_offsets, size, wrap_value,
+ diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
+ if (is_tld4) {
+ return {{0, 8, 16}, 6, 32, 64};
+ } else {
+ return {{0, 4, 8}, 4, 8, 16};
+ }
+ }();
+ const u32 mask = (1U << size) - 1;
+
+ std::vector<Node> aoffi;
+ aoffi.reserve(coord_count);
+
+ const auto aoffi_immediate{
+ TrackImmediate(aoffi_reg, global_code, static_cast<s64>(global_code.size()))};
+ if (!aoffi_immediate) {
+ // Variable access, not supported on AMD.
+ LOG_WARNING(HW_GPU,
+ "AOFFI constant folding failed, some hardware might have graphical issues");
+ for (std::size_t coord = 0; coord < coord_count; ++coord) {
+ const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+ const Node condition =
+ Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
+ const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
+ aoffi.push_back(Operation(OperationCode::Select, condition, negative, value));
+ }
+ return aoffi;
+ }
+
+ for (std::size_t coord = 0; coord < coord_count; ++coord) {
+ s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+ if (value >= wrap_value) {
+ value -= diff_value;
+ }
+ aoffi.push_back(Immediate(value));
+ }
+ return aoffi;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 5bc3a3900..4888998d3 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -7,6 +7,7 @@
#include <array>
#include <cstring>
#include <map>
+#include <optional>
#include <set>
#include <string>
#include <tuple>
@@ -290,6 +291,7 @@ struct MetaTexture {
const Sampler& sampler;
Node array{};
Node depth_compare{};
+ std::vector<Node> aoffi;
Node bias{};
Node lod{};
Node component{};
@@ -741,14 +743,14 @@ private:
Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
- bool is_array);
+ bool is_array, bool is_aoffi);
Node4 GetTexsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
bool is_array);
Node4 GetTld4Code(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
- bool depth_compare, bool is_array);
+ bool depth_compare, bool is_array, bool is_aoffi);
Node4 GetTldsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
bool is_array);
@@ -757,9 +759,11 @@ private:
Tegra::Shader::TextureType texture_type, bool depth_compare, bool is_array,
bool lod_bias_enabled, std::size_t max_coords, std::size_t max_inputs);
+ std::vector<Node> GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, bool is_tld4);
+
Node4 GetTextureCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, std::vector<Node> coords,
- Node array, Node depth_compare, u32 bias_offset);
+ Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi);
Node GetVideoOperand(Node op, bool is_chunk, bool is_signed, Tegra::Shader::VideoType type,
u64 byte_height);
@@ -773,6 +777,8 @@ private:
Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor);
+ std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor);
+
std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);
template <typename... T>
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 33b071747..4505667ff 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -6,6 +6,7 @@
#include <utility>
#include <variant>
+#include "common/common_types.h"
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {
@@ -14,7 +15,7 @@ namespace {
std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
OperationCode operation_code) {
for (; cursor >= 0; --cursor) {
- const Node node = code[cursor];
+ const Node node = code.at(cursor);
if (const auto operation = std::get_if<OperationNode>(node)) {
if (operation->GetCode() == operation_code)
return {node, cursor};
@@ -64,6 +65,20 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) {
return nullptr;
}
+std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) {
+ // Reduce the cursor by one to avoid infinite loops when the instruction sets the same register
+ // that it uses as an operand
+ const auto [found, found_cursor] =
+ TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1);
+ if (!found) {
+ return {};
+ }
+ if (const auto immediate = std::get_if<ImmediateNode>(found)) {
+ return immediate->GetValue();
+ }
+ return {};
+}
+
std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code,
s64 cursor) {
for (; cursor >= 0; --cursor) {
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index cad7340f5..995d0e068 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -6,7 +6,6 @@
#include <cstring>
#include "common/alignment.h"
#include "common/assert.h"
-#include "core/memory.h"
#include "video_core/gpu.h"
#include "video_core/textures/decoders.h"
#include "video_core/textures/texture.h"
@@ -230,18 +229,18 @@ u32 BytesPerPixel(TextureFormat format) {
}
}
-void UnswizzleTexture(u8* const unswizzled_data, VAddr address, u32 tile_size_x, u32 tile_size_y,
+void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y,
u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height,
u32 block_depth, u32 width_spacing) {
CopySwizzledData((width + tile_size_x - 1) / tile_size_x,
(height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel,
- bytes_per_pixel, Memory::GetPointer(address), unswizzled_data, true,
- block_height, block_depth, width_spacing);
+ bytes_per_pixel, address, unswizzled_data, true, block_height, block_depth,
+ width_spacing);
}
-std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y,
- u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
- u32 block_height, u32 block_depth, u32 width_spacing) {
+std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel,
+ u32 width, u32 height, u32 depth, u32 block_height,
+ u32 block_depth, u32 width_spacing) {
std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel);
UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel,
width, height, depth, block_height, block_depth, width_spacing);
@@ -249,8 +248,7 @@ std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y
}
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
- u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
- u32 block_height) {
+ u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height) {
const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
gob_size_x};
for (u32 line = 0; line < subrect_height; ++line) {
@@ -262,17 +260,17 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
const u32 gob_address =
gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x];
- const VAddr source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
- const VAddr dest_addr = swizzled_data + swizzled_offset;
+ u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
+ u8* dest_addr = swizzled_data + swizzled_offset;
- Memory::CopyBlock(dest_addr, source_line, bytes_per_pixel);
+ std::memcpy(dest_addr, source_line, bytes_per_pixel);
}
}
}
void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
- u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
- u32 block_height, u32 offset_x, u32 offset_y) {
+ u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
+ u32 offset_x, u32 offset_y) {
for (u32 line = 0; line < subrect_height; ++line) {
const u32 y2 = line + offset_y;
const u32 gob_address_y = (y2 / (gob_size_y * block_height)) * gob_size * block_height +
@@ -282,10 +280,10 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
const u32 x2 = (x + offset_x) * bytes_per_pixel;
const u32 gob_address = gob_address_y + (x2 / gob_size_x) * gob_size * block_height;
const u32 swizzled_offset = gob_address + table[x2 % gob_size_x];
- const VAddr dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel;
- const VAddr source_addr = swizzled_data + swizzled_offset;
+ u8* dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel;
+ u8* source_addr = swizzled_data + swizzled_offset;
- Memory::CopyBlock(dest_line, source_addr, bytes_per_pixel);
+ std::memcpy(dest_line, source_addr, bytes_per_pixel);
}
}
}
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 65df86890..e078fa274 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -17,14 +17,14 @@ inline std::size_t GetGOBSize() {
}
/// Unswizzles a swizzled texture without changing its format.
-void UnswizzleTexture(u8* unswizzled_data, VAddr address, u32 tile_size_x, u32 tile_size_y,
+void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y,
u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
u32 block_height = TICEntry::DefaultBlockHeight,
u32 block_depth = TICEntry::DefaultBlockHeight, u32 width_spacing = 0);
/// Unswizzles a swizzled texture without changing its format.
-std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y,
- u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
+std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel,
+ u32 width, u32 height, u32 depth,
u32 block_height = TICEntry::DefaultBlockHeight,
u32 block_depth = TICEntry::DefaultBlockHeight,
u32 width_spacing = 0);
@@ -44,12 +44,11 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
/// Copies an untiled subrectangle into a tiled surface.
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
- u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
- u32 block_height);
+ u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height);
/// Copies a tiled subrectangle into a linear surface.
void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
- u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
- u32 block_height, u32 offset_x, u32 offset_y);
+ u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
+ u32 offset_x, u32 offset_y);
} // namespace Tegra::Texture