Diffstat (limited to 'src/video_core')
144 files changed, 5562 insertions, 2482 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 3cd896a0f..abcee2a1c 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,8 +1,28 @@ +add_subdirectory(host_shaders) + add_library(video_core STATIC buffer_cache/buffer_block.h buffer_cache/buffer_cache.h buffer_cache/map_interval.cpp buffer_cache/map_interval.h + cdma_pusher.cpp + cdma_pusher.h + command_classes/codecs/codec.cpp + command_classes/codecs/codec.h + command_classes/codecs/h264.cpp + command_classes/codecs/h264.h + command_classes/codecs/vp9.cpp + command_classes/codecs/vp9.h + command_classes/codecs/vp9_types.h + command_classes/host1x.cpp + command_classes/host1x.h + command_classes/nvdec.cpp + command_classes/nvdec.h + command_classes/nvdec_common.h + command_classes/sync_manager.cpp + command_classes/sync_manager.h + command_classes/vic.cpp + command_classes/vic.h compatible_formats.cpp compatible_formats.h dirty_flags.cpp @@ -188,6 +208,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h + renderer_vulkan/vk_command_pool.cpp + renderer_vulkan/vk_command_pool.h renderer_vulkan/vk_compute_pass.cpp renderer_vulkan/vk_compute_pass.h renderer_vulkan/vk_compute_pipeline.cpp @@ -202,6 +224,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_graphics_pipeline.h renderer_vulkan/vk_image.cpp renderer_vulkan/vk_image.h + renderer_vulkan/vk_master_semaphore.cpp + renderer_vulkan/vk_master_semaphore.h renderer_vulkan/vk_memory_manager.cpp renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp @@ -212,8 +236,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp renderer_vulkan/vk_renderpass_cache.h - renderer_vulkan/vk_resource_manager.cpp - renderer_vulkan/vk_resource_manager.h + renderer_vulkan/vk_resource_pool.cpp + renderer_vulkan/vk_resource_pool.h renderer_vulkan/vk_sampler_cache.cpp renderer_vulkan/vk_sampler_cache.h renderer_vulkan/vk_scheduler.cpp @@ -244,6 +268,17 @@ create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) target_link_libraries(video_core PRIVATE glad xbyak) +if (MSVC) + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib) +else() + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES}) +endif() + +add_dependencies(video_core host_shaders) +target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) + if (ENABLE_VULKAN) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) target_compile_definitions(video_core PRIVATE HAS_VULKAN) @@ -264,5 +299,17 @@ endif() if (MSVC) target_compile_options(video_core PRIVATE /we4267) else() - target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion) + target_compile_options(video_core PRIVATE + -Werror=conversion + -Wno-error=sign-conversion + -Werror=pessimizing-move + -Werror=redundant-move + -Werror=switch + -Werror=type-limits + -Werror=unused-variable + + $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable> + ) endif() diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h 
index b5dc68902..e7edd733f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -51,46 +51,43 @@ public: bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - if (!cpu_addr_opt) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { return GetEmptyBuffer(size); } - const VAddr cpu_addr = *cpu_addr_opt; // Cache management is a big overhead, so only cache entries with a given size. // TODO: Figure out which size is the best for given games. constexpr std::size_t max_stream_size = 0x800; if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size); + if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { + const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { u8* dest; if (is_granular) { - dest = memory_manager.GetPointer(gpu_addr); + dest = gpu_memory.GetPointer(gpu_addr); } else { staging_buffer.resize(size); dest = staging_buffer.data(); - memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); } return ConstBufferUpload(dest, size); } if (is_granular) { - u8* const host_ptr = memory_manager.GetPointer(gpu_addr); + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { std::memcpy(dest, host_ptr, size); }); } else { - return StreamBufferUpload( - size, alignment, [&memory_manager, gpu_addr, size](u8* dest) { - memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); - }); + return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); + }); } } } - Buffer* const block = GetBlock(cpu_addr, size); - MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size); + Buffer* const block = GetBlock(*cpu_addr, size); + MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); if (!map) { return GetEmptyBuffer(size); } @@ -106,7 +103,7 @@ public: } } - return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()}; + return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 
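Two things happen in the hunk above: the cache stops reaching through Core::System and instead reads straight off the injected gpu_memory interface, and uploads smaller than max_stream_size (0x800 bytes) that target read-only regions bypass cache bookkeeping entirely. A minimal sketch of that size-based dispatch, using hypothetical names (TinyBufferCache, Upload) rather than the cache's real interface:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical miniature of the dispatch above, not the cache's real
    // interface: small read-only uploads are streamed, the rest is cached.
    struct BufferInfo {
        std::uint64_t handle{};
        std::size_t offset{};
    };

    class TinyBufferCache {
    public:
        BufferInfo Upload(const std::uint8_t* src, std::size_t size, bool is_written) {
            constexpr std::size_t max_stream_size = 0x800;
            if (!is_written && size < max_stream_size) {
                // Transient path: append to the stream buffer, no interval tracking.
                const std::size_t offset = stream.size();
                stream.insert(stream.end(), src, src + size);
                return {0, offset};
            }
            // Cached path: the real cache registers a MapInterval here so dirty
            // data can later be flushed back to guest memory.
            const std::size_t offset = cached.size();
            cached.insert(cached.end(), src, src + size);
            return {1, offset};
        }

    private:
        std::vector<std::uint8_t> stream;
        std::vector<std::uint8_t> cached;
    };

The transient path trades possible re-uploads for skipping interval registration, which the TODO above identifies as the dominant overhead for tiny constant buffers.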
@@ -262,9 +259,11 @@ public: virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; protected: - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr<StreamBuffer> stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {} + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + std::unique_ptr<StreamBuffer> stream_buffer_) + : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, + stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {} ~BufferCache() = default; @@ -326,14 +325,13 @@ private: MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); if (overlaps.empty()) { - auto& memory_manager = system.GPU().MemoryManager(); const VAddr cpu_addr_end = cpu_addr + size; - if (memory_manager.IsGranularRange(gpu_addr, size)) { - u8* host_ptr = memory_manager.GetPointer(gpu_addr); + if (gpu_memory.IsGranularRange(gpu_addr, size)) { + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); block->Upload(block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); + gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); } return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); @@ -392,7 +390,7 @@ private: continue; } staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); + cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); } } @@ -431,7 +429,7 @@ private: const std::size_t size = map->end - map->start; staging_buffer.resize(size); block->Download(block->Offset(map->start), size, staging_buffer.data()); - system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size); + cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } @@ -567,7 +565,8 @@ private: } VideoCore::RasterizerInterface& rasterizer; - Core::System& system; + Tegra::MemoryManager& gpu_memory; + Core::Memory::Memory& cpu_memory; std::unique_ptr<StreamBuffer> stream_buffer; BufferType stream_buffer_handle; diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..b60f86260 --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,171 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "common/bit_util.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu) + : gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)), + vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), + host1x_processor(std::make_unique<Host1x>(gpu)), + nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)), + vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { + cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { + while (!cdma_queue.empty()) { + Step(); + } +} + +void CDmaPusher::Step() { + const auto entries{cdma_queue.front()}; + cdma_queue.pop(); + + std::vector<u32> values(entries.size()); + std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + + for (const u32 value : values) { + if (mask != 0) { + const u32 lbs = Common::CountTrailingZeroes32(mask); + mask &= ~(1U << lbs); + ExecuteCommand(static_cast<u32>(offset + lbs), value); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(static_cast<u32>(offset), value); + if (incrementing) { + ++offset; + } + continue; + } + const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value & 0x3f; + offset = (value >> 16) & 0xfff; + current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value & 0xffff; + offset = (value >> 16) & 0xfff; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value & 0xffff; + offset = (value >> 16) & 0xfff; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value & 0xfff; + offset = (value >> 16) & 0xfff; + ExecuteCommand(static_cast<u32>(offset), data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); + break; + } + } +} + +void CDmaPusher::ExecuteCommand(u32 offset, u32 data) { + switch (current_class) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, offset, {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + nvdec_sync->Increment(syncpoint_id); + } else { + nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + nvdec_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + 
static_cast<u32>(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod( + static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data}); + break; + default: + break; + } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + vic_sync->Increment(syncpoint_id); + } else { + vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + vic_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast<u32>(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0), + {data}); + break; + default: + break; + } + break; + case ChClassId::Host1x: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data}); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); + break; + } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset; + std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..982f309c5 --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,138 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
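Step() above cracks each 32-bit channel word by submission mode: SetClass latches a class id plus an execution mask, Incrementing/NonIncrementing arm a count of data words, and Immediate executes in place. A stand-alone sketch of the field extraction, matching the ChCommandHeader bitfields declared below; the example word is made up:

    #include <cstdint>
    #include <cstdio>

    // Sketch of how Step() cracks one 32-bit channel command word.
    // Layout per the ChCommandHeader bitfields: bits 0-15 value/count,
    // bits 16-27 method offset, bits 28-31 submission mode.
    int main() {
        const std::uint32_t word = 0x20010010;         // illustrative word only
        const std::uint32_t mode = (word >> 28) & 0xf; // 2 = NonIncrementing
        const std::uint32_t offset = (word >> 16) & 0xfff;
        const std::uint32_t count = word & 0xffff;
        std::printf("mode=%u offset=0x%x count=%u\n", mode, offset, count);
        // mode 2 means the next `count` data words are all written to
        // `offset` on the current class (NVDEC, VIC or Host1x).
    }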
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { + SetClass = 0, + Incrementing = 1, + NonIncrementing = 2, + Mask = 3, + Immediate = 4, + Restart = 5, + Gather = 6, +}; + +enum class ChClassId : u32 { + NoClass = 0x0, + Host1x = 0x1, + VideoEncodeMpeg = 0x20, + VideoEncodeNvEnc = 0x21, + VideoStreamingVi = 0x30, + VideoStreamingIsp = 0x32, + VideoStreamingIspB = 0x34, + VideoStreamingViI2c = 0x36, + GraphicsVic = 0x5d, + Graphics3D = 0x60, + GraphicsGpu = 0x61, + Tsec = 0xe0, + TsecB = 0xe1, + NvJpg = 0xc0, + NvDec = 0xf0 +}; + +enum class ChMethod : u32 { + Empty = 0, + SetMethod = 0x10, + SetData = 0x11, +}; + +union ChCommandHeader { + u32 raw; + BitField<0, 16, u32> value; + BitField<16, 12, ChMethod> method_offset; + BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { + ChClassId class_id{}; + int method_offset{}; + std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>; +using ChCommandList = std::vector<Tegra::ChCommand>; + +struct ThiRegisters { + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { + IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), + SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), + SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: + explicit CDmaPusher(GPU& gpu); + ~CDmaPusher(); + + /// Push NVDEC command buffer entries into queue + void Push(ChCommandHeaderList&& entries); + + /// Process queued command buffer entries + void DispatchCalls(); + + /// Process one queue element + void Step(); + + /// Invoke command class devices to execute the command based on the current state + void ExecuteCommand(u32 offset, u32 data); + +private: + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments); + + GPU& gpu; + + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + std::unique_ptr<Tegra::Vic> vic_processor; + std::unique_ptr<Tegra::Host1x> host1x_processor; + std::unique_ptr<SyncptIncrManager> nvdec_sync; + std::unique_ptr<SyncptIncrManager> vic_sync; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; + + s32 count{}; + s32 offset{}; + s32 mask{}; + bool incrementing{}; + + // Queue of command lists to be processed + std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..1adf3cd13 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file 
included. + +#include <cstring> +#include <fstream> +#include <vector> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +Codec::Codec(GPU& gpu_) + : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), + vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + avcodec_send_packet(av_codec_ctx, nullptr); + avcodec_receive_frame(av_codec_ctx, av_frame); + avcodec_flush_buffers(av_codec_ctx); + + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); + current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); + std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { + bool is_first_frame = false; + + if (!initialized) { + if (current_codec == NvdecCommon::VideoCodec::H264) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); + } else { + LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec)); + return; + } + + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_frame = av_frame_alloc(); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + + // TODO(ameerj): libavcodec gpu hw acceleration + + const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); + if (av_error < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); + return; + } + initialized = true; + is_first_frame = true; + } + bool vp9_hidden_frame = false; + + AVPacket packet{}; + av_init_packet(&packet); + std::vector<u8> frame_data; + + if (current_codec == NvdecCommon::VideoCodec::H264) { + frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + frame_data = vp9_decoder->ComposeFrameHeader(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + } + + packet.data = frame_data.data(); + packet.size = static_cast<int>(frame_data.size()); + + avcodec_send_packet(av_codec_ctx, &packet); + + if (!vp9_hidden_frame) { + // Only receive/store visible frames + avcodec_receive_frame(av_codec_ctx, av_frame); + } +} + +AVFrame* Codec::GetCurrentFrame() { + return av_frame; +} + +const AVFrame* Codec::GetCurrentFrame() const { + return av_frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 000000000..5bbe6a332 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,66 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
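Codec::Decode() above feeds FFmpeg through the send/receive pair but ignores the return values of avcodec_send_packet and avcodec_receive_frame. A hedged sketch of the same flow with the error paths surfaced; DecodeOne is a hypothetical helper, not part of this commit, and assumes a context already opened via avcodec_open2:

    #include <cerrno>
    extern "C" {
    #include <libavcodec/avcodec.h>
    }

    // Send/receive decode step with the return codes checked.
    static bool DecodeOne(AVCodecContext* ctx, const AVPacket* pkt, AVFrame* out) {
        if (avcodec_send_packet(ctx, pkt) < 0) {
            return false; // bitstream rejected by the decoder
        }
        const int ret = avcodec_receive_frame(ctx, out);
        if (ret == AVERROR(EAGAIN)) {
            return false; // decoder needs more input before emitting a frame
        }
        return ret == 0;
    }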
+ +#pragma once + +#include <memory> +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu); + ~Codec(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(NvdecCommon::VideoCodec codec); + + /// Populate NvdecRegisters state with argument value at the provided offset + void StateWrite(u32 offset, u64 arguments); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns most recently decoded frame + [[nodiscard]] AVFrame* GetCurrentFrame(); + [[nodiscard]] const AVFrame* GetCurrentFrame() const; + + /// Returns the value of current_codec + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: + bool initialized{}; + NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + + AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + AVFrame* av_frame{nullptr}; + + GPU& gpu; + std::unique_ptr<Decoder::H264> h264_decoder; + std::unique_ptr<Decoder::VP9> vp9_decoder; + + NvdecCommon::NvdecRegisters state{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..33e063e20 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,293 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <array> +#include "common/bit_util.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec.
+constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.frame_data_size); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + } else { + /// Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); + } else if (order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 
1 : 2); + + writer.WriteUe(16); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); + } + + writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); + writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); + s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); + pic_init_qp = (pic_init_qp << 26) >> 26; + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); + chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); + writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + const auto matrix_x4 = + std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); + writer.WriteScalingList(matrix_x4, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), + context.scaling_matrix_8.end()); + + writer.WriteScalingList(matrix_x8, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); + chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.frame_data_size); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), + context.frame_data_size); + } + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 
1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { + std::vector<u8> scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast<s32>(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1)); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..273449495 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// Append a bit to the stream, with a value equal to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the stream + void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 header of the frame for FFmpeg decoding + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame = false); + +private: + struct H264ParameterSet { + u32 log2_max_pic_order_cnt{}; + u32 delta_pic_order_always_zero_flag{}; + u32 frame_mbs_only_flag{}; + u32 pic_width_in_mbs{}; + u32 pic_height_in_map_units{}; + INSERT_PADDING_WORDS(1); + u32 entropy_coding_mode_flag{}; + u32 bottom_field_pic_order_flag{}; + u32 num_refidx_l0_default_active{}; + u32 num_refidx_l1_default_active{}; + u32 deblocking_filter_control_flag{}; + u32 redundant_pic_count_flag{}; + u32 transform_8x8_mode_flag{}; + INSERT_PADDING_WORDS(9); + u64 flags{}; + u32 frame_number{}; + u32 frame_number2{}; + }; + static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_BYTES(0x48); + u32 frame_data_size{}; + INSERT_PADDING_BYTES(0xc); + H264ParameterSet h264_parameter_set{}; + INSERT_PADDING_BYTES(0x100); + std::array<u8, 0x60> scaling_matrix_4; + std::array<u8, 0x80> scaling_matrix_8; + }; + static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + + std::vector<u8> frame; + GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..ab44fdc9e --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,1040 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
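H264BitWriter::WriteUe above emits the ue(v) Exp-Golomb code from clause 9.1: with N the bit length of value + 1, it writes N - 1 leading zeros, a marker 1, then the low N - 1 bits of value + 1 (that is what the value -= (1 << (size - 1)) - 1 adjustment computes). A self-contained illustration for spot-checking the emitted patterns; ExpGolombUe is hypothetical:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Bit pattern of ue(v) per H.264 clause 9.1, rendered as a string.
    static std::string ExpGolombUe(std::uint32_t value) {
        const std::uint32_t code = value + 1;
        int size = 0;
        while ((1u << size) <= code) {
            ++size; // bit length of code
        }
        std::string bits(size - 1, '0'); // leading zeros
        for (int i = size - 1; i >= 0; --i) {
            bits += ((code >> i) & 1) ? '1' : '0';
        }
        return bits;
    }

    int main() {
        // ue(0)="1", ue(3)="00100", ue(7)="0001000", matching the spec tables.
        for (std::uint32_t v : {0u, 3u, 7u}) {
            std::printf("ue(%u) = %s\n", v, ExpGolombUe(v).c_str());
        }
    }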
+ +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 0, 84, 49, 136, 0, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 107, 169, 0, 35, 99, 159, 0, 17, 82, 140, 0, 8, 66, 114, 0, + 2, 44, 76, 0, 1, 19, 32, 0, 40, 132, 201, 0, 29, 114, 187, 0, 13, 91, 157, 0, + 7, 75, 127, 0, 3, 58, 95, 0, 1, 28, 47, 0, 69, 142, 221, 0, 42, 122, 201, 0, + 15, 91, 159, 0, 6, 67, 121, 0, 1, 42, 77, 0, 1, 17, 31, 0, 102, 148, 228, 0, + 67, 117, 204, 0, 17, 82, 154, 0, 6, 59, 114, 0, 2, 39, 75, 0, 1, 15, 29, 0, + 156, 57, 233, 0, 119, 57, 212, 0, 58, 48, 163, 0, 29, 40, 124, 0, 12, 30, 81, 0, + 3, 12, 31, 0, 191, 107, 226, 0, 124, 117, 204, 0, 25, 99, 155, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 29, 148, 210, 0, 37, 126, 194, 0, 8, 93, 157, 0, + 2, 68, 118, 0, 1, 39, 69, 0, 1, 17, 33, 0, 41, 151, 213, 0, 27, 123, 193, 0, + 3, 82, 144, 0, 1, 58, 105, 0, 1, 32, 60, 0, 1, 13, 26, 0, 59, 159, 220, 0, + 23, 126, 198, 0, 4, 88, 151, 0, 1, 66, 114, 0, 1, 38, 71, 0, 1, 18, 34, 0, + 114, 136, 232, 0, 51, 114, 207, 0, 11, 83, 155, 0, 3, 56, 105, 0, 1, 33, 65, 0, + 1, 17, 34, 0, 149, 65, 234, 0, 121, 57, 215, 0, 61, 49, 166, 0, 28, 36, 114, 0, + 12, 25, 76, 0, 3, 16, 42, 0, 214, 49, 220, 0, 132, 63, 188, 0, 42, 65, 137, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 137, 221, 0, 104, 131, 216, 0, + 49, 111, 192, 0, 21, 87, 155, 0, 2, 49, 87, 0, 1, 16, 28, 0, 89, 163, 230, 0, + 90, 137, 220, 0, 29, 100, 183, 0, 10, 70, 135, 0, 2, 42, 81, 0, 1, 17, 33, 0, + 108, 167, 237, 0, 55, 133, 222, 0, 15, 97, 179, 0, 4, 72, 135, 0, 1, 45, 85, 0, + 1, 19, 38, 0, 124, 146, 240, 0, 66, 124, 224, 0, 17, 88, 175, 0, 4, 58, 122, 0, + 1, 36, 75, 0, 1, 18, 37, 0, 141, 79, 241, 0, 126, 70, 227, 0, 66, 58, 182, 0, + 30, 44, 136, 0, 12, 34, 96, 0, 2, 20, 47, 0, 229, 99, 249, 0, 143, 111, 235, 0, + 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82, 158, 236, 0, + 94, 146, 224, 0, 25, 117, 191, 0, 9, 87, 149, 0, 3, 56, 99, 0, 1, 33, 57, 0, + 83, 167, 237, 0, 68, 145, 222, 0, 10, 103, 177, 0, 2, 72, 131, 0, 1, 41, 79, 0, + 1, 20, 39, 0, 99, 167, 239, 0, 47, 141, 224, 0, 10, 104, 178, 0, 2, 73, 133, 0, + 1, 44, 85, 0, 1, 22, 47, 0, 127, 145, 243, 0, 71, 129, 228, 0, 17, 93, 177, 0, + 3, 61, 124, 0, 1, 41, 84, 0, 1, 21, 52, 0, 157, 78, 244, 0, 140, 72, 231, 0, + 69, 58, 184, 0, 31, 44, 137, 0, 14, 38, 105, 0, 8, 23, 61, 0, 125, 34, 187, 0, + 52, 41, 133, 0, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 0, 51, 102, 147, 0, 23, 87, 128, 0, 8, 67, 101, 0, 1, 41, 63, 0, + 1, 19, 29, 0, 31, 154, 185, 0, 17, 127, 175, 0, 6, 96, 145, 0, 2, 73, 114, 0, + 1, 51, 82, 0, 1, 28, 45, 0, 23, 163, 200, 0, 10, 131, 185, 0, 2, 93, 148, 0, + 1, 67, 111, 0, 1, 41, 69, 0, 1, 14, 24, 0, 29, 176, 217, 0, 12, 145, 201, 0, + 3, 101, 156, 0, 1, 69, 111, 0, 1, 39, 63, 0, 
1, 14, 23, 0, 57, 192, 233, 0, + 25, 154, 215, 0, 6, 109, 167, 0, 3, 78, 118, 0, 1, 48, 69, 0, 1, 21, 29, 0, + 202, 105, 245, 0, 108, 106, 216, 0, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 33, 172, 219, 0, 64, 149, 206, 0, 14, 117, 177, 0, 5, 90, 141, 0, + 2, 61, 95, 0, 1, 37, 57, 0, 33, 179, 220, 0, 11, 140, 198, 0, 1, 89, 148, 0, + 1, 60, 104, 0, 1, 33, 57, 0, 1, 12, 21, 0, 30, 181, 221, 0, 8, 141, 198, 0, + 1, 87, 145, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 20, 0, 32, 186, 224, 0, + 7, 142, 198, 0, 1, 86, 143, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 22, 0, + 57, 192, 227, 0, 20, 143, 204, 0, 3, 96, 154, 0, 1, 68, 112, 0, 1, 42, 69, 0, + 1, 19, 32, 0, 212, 35, 215, 0, 113, 47, 169, 0, 29, 48, 105, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 74, 129, 203, 0, 106, 120, 203, 0, 49, 107, 178, 0, + 19, 84, 144, 0, 4, 50, 84, 0, 1, 15, 25, 0, 71, 172, 217, 0, 44, 141, 209, 0, + 15, 102, 173, 0, 6, 76, 133, 0, 2, 51, 89, 0, 1, 24, 42, 0, 64, 185, 231, 0, + 31, 148, 216, 0, 8, 103, 175, 0, 3, 74, 131, 0, 1, 46, 81, 0, 1, 18, 30, 0, + 65, 196, 235, 0, 25, 157, 221, 0, 5, 105, 174, 0, 1, 67, 120, 0, 1, 38, 69, 0, + 1, 15, 30, 0, 65, 204, 238, 0, 30, 156, 224, 0, 7, 107, 177, 0, 2, 70, 124, 0, + 1, 42, 73, 0, 1, 18, 34, 0, 225, 86, 251, 0, 144, 104, 235, 0, 42, 99, 181, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 175, 239, 0, 112, 165, 229, 0, + 29, 136, 200, 0, 12, 103, 162, 0, 6, 77, 123, 0, 2, 53, 84, 0, 75, 183, 239, 0, + 30, 155, 221, 0, 3, 106, 171, 0, 1, 74, 128, 0, 1, 44, 76, 0, 1, 17, 28, 0, + 73, 185, 240, 0, 27, 159, 222, 0, 2, 107, 172, 0, 1, 75, 127, 0, 1, 42, 73, 0, + 1, 17, 29, 0, 62, 190, 238, 0, 21, 159, 222, 0, 2, 107, 172, 0, 1, 72, 122, 0, + 1, 40, 71, 0, 1, 18, 32, 0, 61, 199, 240, 0, 27, 161, 226, 0, 4, 113, 180, 0, + 1, 76, 129, 0, 1, 46, 80, 0, 1, 23, 41, 0, 7, 27, 153, 0, 5, 30, 95, 0, + 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 75, 127, 0, + 57, 75, 124, 0, 27, 67, 108, 0, 10, 54, 86, 0, 1, 33, 52, 0, 1, 12, 18, 0, + 43, 125, 151, 0, 26, 108, 148, 0, 7, 83, 122, 0, 2, 59, 89, 0, 1, 38, 60, 0, + 1, 17, 27, 0, 23, 144, 163, 0, 13, 112, 154, 0, 2, 75, 117, 0, 1, 50, 81, 0, + 1, 31, 51, 0, 1, 14, 23, 0, 18, 162, 185, 0, 6, 123, 171, 0, 1, 78, 125, 0, + 1, 51, 86, 0, 1, 31, 54, 0, 1, 14, 23, 0, 15, 199, 227, 0, 3, 150, 204, 0, + 1, 91, 146, 0, 1, 55, 95, 0, 1, 30, 53, 0, 1, 11, 20, 0, 19, 55, 240, 0, + 19, 59, 196, 0, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 0, 104, 153, 199, 0, 31, 123, 181, 0, 14, 101, 152, 0, 5, 72, 106, 0, + 1, 36, 52, 0, 35, 176, 211, 0, 12, 131, 190, 0, 2, 88, 144, 0, 1, 60, 101, 0, + 1, 36, 60, 0, 1, 16, 28, 0, 28, 183, 213, 0, 8, 134, 191, 0, 1, 86, 142, 0, + 1, 56, 96, 0, 1, 30, 53, 0, 1, 12, 20, 0, 20, 190, 215, 0, 4, 135, 192, 0, + 1, 84, 139, 0, 1, 53, 91, 0, 1, 28, 49, 0, 1, 11, 20, 0, 13, 196, 216, 0, + 2, 137, 192, 0, 1, 86, 143, 0, 1, 57, 99, 0, 1, 32, 56, 0, 1, 13, 24, 0, + 211, 29, 217, 0, 96, 47, 156, 0, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 78, 120, 193, 0, 111, 116, 186, 0, 46, 102, 164, 0, 15, 80, 128, 0, + 2, 49, 76, 0, 1, 18, 28, 0, 71, 161, 203, 0, 42, 132, 192, 0, 10, 98, 150, 0, + 3, 69, 109, 0, 1, 44, 70, 0, 1, 18, 29, 0, 57, 186, 211, 0, 30, 140, 196, 0, + 4, 93, 146, 0, 1, 62, 102, 0, 1, 38, 65, 0, 1, 16, 27, 0, 47, 199, 217, 0, + 14, 145, 196, 0, 1, 88, 142, 0, 1, 57, 98, 0, 1, 36, 62, 0, 1, 15, 26, 0, + 26, 219, 229, 0, 5, 155, 207, 0, 1, 94, 151, 0, 1, 60, 104, 0, 1, 36, 62, 0, + 1, 16, 28, 0, 233, 29, 248, 0, 146, 47, 220, 0, 43, 52, 140, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 100, 163, 232, 0, 179, 161, 222, 0, 63, 142, 204, 0, + 37, 113, 174, 0, 26, 89, 137, 0, 18, 68, 97, 0, 85, 181, 230, 0, 32, 146, 209, 0, + 7, 100, 164, 0, 3, 71, 121, 0, 1, 45, 77, 0, 1, 18, 30, 0, 65, 187, 230, 0, + 20, 148, 207, 0, 2, 97, 159, 0, 1, 68, 116, 0, 1, 40, 70, 0, 1, 14, 29, 0, + 40, 194, 227, 0, 8, 147, 204, 0, 1, 94, 155, 0, 1, 65, 112, 0, 1, 39, 66, 0, + 1, 14, 26, 0, 16, 208, 228, 0, 3, 151, 207, 0, 1, 98, 160, 0, 1, 67, 117, 0, + 1, 41, 74, 0, 1, 17, 31, 0, 17, 38, 140, 0, 7, 34, 80, 0, 1, 17, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 75, 128, 0, 41, 76, 128, 0, + 26, 66, 116, 0, 12, 52, 94, 0, 2, 32, 55, 0, 1, 10, 16, 0, 50, 127, 154, 0, + 37, 109, 152, 0, 16, 82, 121, 0, 5, 59, 85, 0, 1, 35, 54, 0, 1, 13, 20, 0, + 40, 142, 167, 0, 17, 110, 157, 0, 2, 71, 112, 0, 1, 44, 72, 0, 1, 27, 45, 0, + 1, 11, 17, 0, 30, 175, 188, 0, 9, 124, 169, 0, 1, 74, 116, 0, 1, 48, 78, 0, + 1, 30, 49, 0, 1, 11, 18, 0, 10, 222, 223, 0, 2, 150, 194, 0, 1, 83, 128, 0, + 1, 48, 79, 0, 1, 27, 45, 0, 1, 11, 17, 0, 36, 41, 235, 0, 29, 36, 193, 0, + 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 165, 222, 0, + 177, 162, 215, 0, 110, 135, 195, 0, 57, 113, 168, 0, 23, 83, 120, 0, 10, 49, 61, 0, + 85, 190, 223, 0, 36, 139, 200, 0, 5, 90, 146, 0, 1, 60, 103, 0, 1, 38, 65, 0, + 1, 18, 30, 0, 72, 202, 223, 0, 23, 141, 199, 0, 2, 86, 140, 0, 1, 56, 97, 0, + 1, 36, 61, 0, 1, 16, 27, 0, 55, 218, 225, 0, 13, 145, 200, 0, 1, 86, 141, 0, + 1, 57, 99, 0, 1, 35, 61, 0, 1, 13, 22, 0, 15, 235, 212, 0, 1, 132, 184, 0, + 1, 84, 139, 0, 1, 57, 97, 0, 1, 34, 56, 0, 1, 14, 23, 0, 181, 21, 201, 0, + 61, 37, 123, 0, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 0, 95, 104, 173, 0, 42, 93, 159, 0, 18, 77, 131, 0, 4, 50, 81, 0, + 1, 17, 23, 0, 62, 147, 199, 0, 44, 130, 189, 0, 28, 102, 154, 0, 18, 75, 115, 0, + 2, 44, 65, 0, 1, 12, 19, 0, 55, 153, 210, 0, 24, 130, 194, 0, 3, 93, 146, 0, + 1, 61, 97, 0, 1, 31, 50, 0, 1, 10, 16, 0, 49, 186, 223, 0, 17, 148, 204, 0, + 1, 96, 142, 0, 1, 53, 83, 0, 1, 26, 44, 0, 1, 11, 17, 0, 13, 217, 212, 0, + 2, 136, 180, 0, 1, 78, 124, 0, 1, 50, 83, 0, 1, 29, 49, 0, 1, 14, 23, 0, + 197, 13, 247, 0, 82, 17, 222, 0, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 126, 186, 247, 0, 234, 191, 243, 0, 176, 177, 234, 0, 104, 158, 220, 0, + 66, 128, 186, 0, 55, 90, 137, 0, 111, 197, 242, 0, 46, 158, 219, 0, 9, 104, 171, 0, + 2, 65, 125, 0, 1, 44, 80, 0, 1, 17, 91, 0, 104, 208, 245, 0, 39, 168, 224, 0, + 3, 109, 162, 0, 1, 79, 124, 0, 1, 50, 102, 0, 1, 43, 102, 0, 84, 220, 246, 0, + 31, 177, 231, 0, 2, 115, 180, 0, 1, 79, 134, 0, 1, 55, 77, 0, 1, 60, 79, 0, + 43, 243, 240, 0, 8, 180, 217, 0, 1, 115, 166, 0, 1, 84, 121, 0, 1, 51, 67, 0, + 1, 16, 6, 0, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 
234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. 
Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu) : gpu(gpu) {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + writer.Write(value, size - 1); + } else { + writer.Write(delta / 2 + mask, size - 1); + writer.Write(delta & 1, 1); + } + } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { + const bool is_lt = value < test; + writer.Write(!is_lt); + return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 2304>& new_prob, + const std::array<u8, 2304>& old_prob) { + // Note: There's 1 byte added on each packet for alignment, + // this byte is ignored when doing updates. 
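+ // Layout walked below: 2 x 2 (plane and reference types) x 6 bands x 6 + // contexts, 4 bytes each (3 probabilities plus the alignment byte), so + // block_bytes is 576 per tx size and the four blocks span all 2304 bytes.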
+ constexpr s32 block_bytes = 2 * 2 * 6 * 6 * 4; + + const auto needs_update = [&](s32 base_index) -> bool { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (new_prob[index + 0] != old_prob[index + 0] || + new_prob[index + 1] != old_prob[index + 1] || + new_prob[index + 2] != old_prob[index + 2]) { + return true; + } + + index += 4; + } + } + } + } + return false; + }; + + for (s32 block_index = 0; block_index < 4; block_index++) { + const s32 base_index = block_index * block_bytes; + const bool update = needs_update(base_index); + writer.Write(update); + + if (update) { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (k != 0 || l < 3) { + WriteProbabilityUpdate(writer, new_prob[index + 0], + old_prob[index + 0]); + WriteProbabilityUpdate(writer, new_prob[index + 1], + old_prob[index + 1]); + WriteProbabilityUpdate(writer, new_prob[index + 2], + old_prob[index + 2]); + } + index += 4; + } + } + } + } + } + + if (block_index == tx_mode) { + break; + } + } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + writer.Write(update, diff_update_probability); + + if (update) { + writer.Write(new_prob >> 1, 7); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); + + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + + // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following + // order: last, golden, altref, current. It may be worthwhile to track the updates done here + // to avoid buffering frame data needed for reference frame updating in the header composition. 
+ std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy{}; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer frame{}; + { + gpu.SyncGuestHost(); + frame.info = GetVp9PictureInfo(state); + + frame.bit_stream.resize(frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(), + frame.info.bitstream_size); + } + // Buffer two frames, saving the last show frame info + if (!next_next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_next_frame.info; + frame.bit_stream = next_next_frame.bit_stream; + next_next_frame = std::move(temp); + + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_frame.info; + frame.bit_stream = next_frame.bit_stream; + next_frame = std::move(temp2); + } else { + next_frame.info = frame.info; + next_frame.bit_stream = frame.bit_stream; + } + } else { + next_next_frame.info = frame.info; + next_next_frame.bit_stream = frame.bit_stream; + } + return frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + + if (!current_frame_info.lossless) { + if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + prev_frame_probs.inter_mode_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + } + + if (current_frame_info.interp_filter 
== 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + } + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + } + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + } + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); 
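+
+            // (prob_bits below stores 10 bit-probabilities per MV component, using the
+            // same indexing as `classes` above.)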
+            for (s32 j = 0; j < 10; j++) {
+                const int index = i * 10 + j;
+
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
+                                         prev_frame_probs.prob_bits[index]);
+            }
+        }
+
+        for (s32 i = 0; i < 2; i++) {
+            for (s32 j = 0; j < 2; j++) {
+                for (s32 k = 0; k < 3; k++) {
+                    const int index = i * 2 * 3 + j * 3 + k;
+
+                    WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
+                                             prev_frame_probs.class_0_fr[index]);
+                }
+            }
+
+            for (s32 j = 0; j < 3; j++) {
+                const int index = i * 3 + j;
+
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
+                                         prev_frame_probs.fr[index]);
+            }
+        }
+
+        if (current_frame_info.allow_high_precision_mv) {
+            for (s32 index = 0; index < 2; index++) {
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
+                                         prev_frame_probs.class_0_hp[index]);
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
+                                         prev_frame_probs.high_precision[index]);
+            }
+        }
+
+        // Save the previous probabilities
+        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) {
+            prev_frame_probs.sign = current_frame_info.entropy.sign;
+            prev_frame_probs.classes = current_frame_info.entropy.classes;
+            prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
+            prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
+            prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
+            prev_frame_probs.fr = current_frame_info.entropy.fr;
+            prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
+            prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
+        }
+    }
+
+    writer.End();
+    return writer.GetBuffer();
+}
+
+VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
+    VpxBitStreamWriter uncomp_writer{};
+
+    uncomp_writer.WriteU(2, 2);                                       // Frame marker.
+    uncomp_writer.WriteU(0, 2);                                       // Profile.
+    uncomp_writer.WriteBit(false);                                    // Show existing frame.
+    uncomp_writer.WriteBit(!current_frame_info.is_key_frame);         // Frame type (0 = key frame).
+    uncomp_writer.WriteBit(current_frame_info.show_frame);            // Show frame.
+    uncomp_writer.WriteBit(current_frame_info.error_resilient_mode);  // Error resilience.
+
+    if (current_frame_info.is_key_frame) {
+        uncomp_writer.WriteU(frame_sync_code, 24);
+        uncomp_writer.WriteU(0, 3); // Color space.
+        uncomp_writer.WriteU(0, 1); // Color range.
+        uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+        uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+        uncomp_writer.WriteBit(false); // Render and frame size different.
+
+        // Reset context
+        prev_frame_probs = default_probs;
+        swap_next_golden = false;
+        loop_filter_ref_deltas.fill(0);
+        loop_filter_mode_deltas.fill(0);
+
+        // Allow frame offsets to stabilize before checking for golden frames
+        grace_period = 4;
+
+        // On key frames, all frame slots are set to the current frame,
+        // so the value of the selected slot doesn't really matter.
+        frame_ctxs.fill({current_frame_number, false, default_probs});
+
+        // Intra-only: the frame can be recreated with no other references
+        current_frame_info.intra_only = true;
+
+    } else {
+
+        if (!current_frame_info.show_frame) {
+            uncomp_writer.WriteBit(current_frame_info.intra_only);
+            if (!current_frame_info.last_frame_was_key) {
+                swap_next_golden = !swap_next_golden;
+            }
+        } else {
+            current_frame_info.intra_only = false;
+        }
+        if (!current_frame_info.error_resilient_mode) {
+            uncomp_writer.WriteU(0, 2); // Reset frame context.
+        }
+
+        // Last, Golden, Altref frames
+        std::array<s32, 3> ref_frame_index{0, 1, 2};
+
+        // When the next frame is hidden, the altref and golden references are swapped
+        if (swap_next_golden) {
+            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        }
+
+        // Refresh the last frame slot by default
+        u64 refresh_frame_flags = 1;
+
+        // The golden frame may refresh; this is detected when the next frame's golden
+        // offset changes
+        bool golden_refresh = false;
+        if (grace_period <= 0) {
+            for (s32 index = 1; index < 3; ++index) {
+                if (current_frame_info.frame_offsets[index] !=
+                    next_frame.info.frame_offsets[index]) {
+                    current_frame_info.refresh_frame[index] = true;
+                    golden_refresh = true;
+                    grace_period = 3;
+                }
+            }
+        }
+
+        if (current_frame_info.show_frame &&
+            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
+            // Update golden frame
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        }
+
+        if (!current_frame_info.show_frame) {
+            // Update altref
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        } else if (golden_refresh) {
+            refresh_frame_flags = 3;
+        }
+
+        if (current_frame_info.intra_only) {
+            uncomp_writer.WriteU(frame_sync_code, 24);
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+        } else {
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+
+            for (s32 index = 1; index < 4; index++) {
+                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
+                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
+            }
+
+            uncomp_writer.WriteBit(true);  // Frame size with refs.
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
+            uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
+
+            if (current_frame_info.interp_filter != 4) {
+                uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
+            }
+        }
+    }
+
+    if (!current_frame_info.error_resilient_mode) {
+        uncomp_writer.WriteBit(true); // Refresh frame context (TODO: verify this flag's source).
+        uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
+    }
+
+    int frame_ctx_idx = 0;
+    if (!current_frame_info.show_frame) {
+        frame_ctx_idx = 1;
+    }
+
+    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
+ prev_frame_probs = + frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array<bool, 4> update_loop_filter_ref_deltas; + std::array<bool, 2> update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. 
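+    // (Example: with tile_cols_log2_diff = 1, inc_mask is 1 and (inc_mask << 1) is written
+    // as the two bits `10`: one increment followed by the terminating zero.)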
+ if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +const std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { + std::vector<u8> bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector<u8> compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), + compressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), + bitstream.data(), bitstream.size()); + + // keep track of frame number + current_frame_number++; + grace_period--; + + // don't display hidden frames + hidden = !current_frame_info.show_frame; + return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = norm_lut[local_range]; + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + 
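+
+// The boolean coder above splits the current range at
+// split = 1 + (((range - 1) * probability) >> 8): a `false` bit keeps the low part of the
+// range (range = split), while a `true` bit adds split to low_value and keeps the rest
+// (range - split). norm_lut then renormalizes the range, emitting a byte whenever the
+// 24-bit low_value window fills (count >= 0).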
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
+    WriteBits(value, value_size);
+}
+
+void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
+    const bool sign = value < 0;
+    if (sign) {
+        value = -value;
+    }
+
+    WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1);
+}
+
+void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
+    const bool delta_coded = value != 0;
+    WriteBit(delta_coded);
+
+    if (delta_coded) {
+        WriteBits(value, 4);
+    }
+}
+
+void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
+    s32 value_pos = 0;
+    s32 remaining = static_cast<s32>(bit_count);
+
+    while (remaining > 0) {
+        s32 copy_size = remaining;
+
+        const s32 free = GetFreeBufferBits();
+
+        if (copy_size > free) {
+            copy_size = free;
+        }
+
+        const s32 mask = (1 << copy_size) - 1;
+
+        const s32 src_shift = (static_cast<s32>(bit_count) - value_pos) - copy_size;
+        const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
+
+        buffer |= ((value >> src_shift) & mask) << dst_shift;
+
+        value_pos += copy_size;
+        buffer_pos += copy_size;
+        remaining -= copy_size;
+    }
+}
+
+void VpxBitStreamWriter::WriteBit(bool state) {
+    WriteBits(state ? 1 : 0, 1);
+}
+
+s32 VpxBitStreamWriter::GetFreeBufferBits() {
+    if (buffer_pos == buffer_size) {
+        Flush();
+    }
+
+    return buffer_size - buffer_pos;
+}
+
+void VpxBitStreamWriter::Flush() {
+    if (buffer_pos == 0) {
+        return;
+    }
+    byte_array.push_back(static_cast<u8>(buffer));
+    buffer = 0;
+    buffer_pos = 0;
+}
+
+std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
+    return byte_array;
+}
+
+const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
+    return byte_array;
+}
+
+} // namespace Tegra::Decoder
diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h
new file mode 100644
index 000000000..e2504512c
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -0,0 +1,208 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/stream.h"
+#include "video_core/command_classes/codecs/vp9_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+enum class FrameType { KeyFrame = 0, InterFrame = 1 };
+namespace Decoder {
+
+/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
+/// VP9 header bitstreams.
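+///
+/// A minimal usage sketch (illustrative only, using the interfaces declared below):
+///
+///   VpxRangeEncoder range_encoder;
+///   range_encoder.Write(true, 128);  // one bit at half probability
+///   range_encoder.End();             // terminate the range-coded stream
+///   const std::vector<u8>& compressed = range_encoder.GetBuffer();
+///
+///   VpxBitStreamWriter bit_writer;
+///   bit_writer.WriteU(2, 2);         // e.g. the two-bit frame marker
+///   bit_writer.Flush();
+///   const std::vector<u8>& uncompressed = bit_writer.GetByteArray();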
+
+class VpxRangeEncoder {
+public:
+    VpxRangeEncoder();
+    ~VpxRangeEncoder();
+
+    VpxRangeEncoder(const VpxRangeEncoder&) = delete;
+    VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete;
+
+    VpxRangeEncoder(VpxRangeEncoder&&) = default;
+    VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default;
+
+    /// Writes the rightmost value_size bits from value into the stream
+    void Write(s32 value, s32 value_size);
+
+    /// Writes a single bit with half probability
+    void Write(bool bit);
+
+    /// Writes a bit to the base_stream encoded with the given probability
+    void Write(bool bit, s32 probability);
+
+    /// Signals the end of the bitstream
+    void End();
+
+    [[nodiscard]] std::vector<u8>& GetBuffer() {
+        return base_stream.GetBuffer();
+    }
+
+    [[nodiscard]] const std::vector<u8>& GetBuffer() const {
+        return base_stream.GetBuffer();
+    }
+
+private:
+    u8 PeekByte();
+    Common::Stream base_stream{};
+    u32 low_value{};
+    u32 range{0xff};
+    s32 count{-24};
+    s32 half_probability{128};
+};
+
+class VpxBitStreamWriter {
+public:
+    VpxBitStreamWriter();
+    ~VpxBitStreamWriter();
+
+    VpxBitStreamWriter(const VpxBitStreamWriter&) = delete;
+    VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete;
+
+    VpxBitStreamWriter(VpxBitStreamWriter&&) = default;
+    VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default;
+
+    /// Writes an unsigned integer value
+    void WriteU(u32 value, u32 value_size);
+
+    /// Writes a signed integer value
+    void WriteS(s32 value, u32 value_size);
+
+    /// Writes a delta-coded value, based on 6.2.10 of the VP9 spec
+    void WriteDeltaQ(u32 value);
+
+    /// Writes a single bit
+    void WriteBit(bool state);
+
+    /// Pushes the current buffer into byte_array and resets the buffer
+    void Flush();
+
+    /// Returns byte_array
+    [[nodiscard]] std::vector<u8>& GetByteArray();
+
+    /// Returns a const reference to byte_array
+    [[nodiscard]] const std::vector<u8>& GetByteArray() const;
+
+private:
+    /// Writes bit_count bits from value into buffer
+    void WriteBits(u32 value, u32 bit_count);
+
+    /// Gets the next available position in buffer, invoking Flush() if the buffer is full
+    s32 GetFreeBufferBits();
+
+    s32 buffer_size{8};
+
+    s32 buffer{};
+    s32 buffer_pos{};
+    std::vector<u8> byte_array;
+};
+
+class VP9 {
+public:
+    explicit VP9(GPU& gpu);
+    ~VP9();
+
+    VP9(const VP9&) = delete;
+    VP9& operator=(const VP9&) = delete;
+
+    VP9(VP9&&) = default;
+    VP9& operator=(VP9&&) = delete;
+
+    /// Composes the VP9 frame from the GPU state information, based on the official VP9 spec
+    /// documentation
+    [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);
+
+    /// Returns true if the most recent frame was a hidden frame.
+    [[nodiscard]] bool WasFrameHidden() const {
+        return hidden;
+    }
+
+private:
+    /// Generates compressed header probability updates in the bitstream writer
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                const std::array<T, N>& old_prob);
+
+    /// Generates compressed header probability updates in the bitstream writer.
+    /// If the probabilities are not equal, WriteProbabilityDelta is invoked
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Generates compressed header probability deltas in the bitstream writer
+    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Inverse of 6.3.4 Decode term subexp
+    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
+
+    /// Writes if the value is less than the test value
+    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
+
+    /// Writes probability updates for the coefficient probabilities
+    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                    const std::array<u8, 2304>& new_prob,
+                                    const std::array<u8, 2304>& old_prob);
+
+    /// Write probabilities for 4-byte aligned structures
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                        const std::array<T, N>& old_prob);
+
+    /// Write motion vector probability updates. 6.3.17 in the spec
+    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Returns VP9 information from the NVDEC-provided offset and size
+    [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
+
+    /// Reads and converts NVDEC-provided entropy probs to a Vp9EntropyProbs struct
+    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
+
+    /// Returns the frame to be decoded after buffering
+    [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
+
+    /// Uses NVDEC-provided information to compose the headers for the current frame
+    [[nodiscard]] std::vector<u8> ComposeCompressedHeader();
+    [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
+
+    GPU& gpu;
+    std::vector<u8> frame;
+
+    std::array<s8, 4> loop_filter_ref_deltas{};
+    std::array<s8, 2> loop_filter_mode_deltas{};
+
+    bool hidden = false;
+    s64 current_frame_number = -2; // since we buffer 2 frames
+    s32 grace_period = 6;          // frame offsets need to stabilize
+    std::array<FrameContexts, 4> frame_ctxs{};
+    Vp9FrameContainer next_frame{};
+    Vp9FrameContainer next_next_frame{};
+    bool swap_next_golden{};
+
+    Vp9PictureInfo current_frame_info{};
+    Vp9EntropyProbs prev_frame_probs{};
+
+    s32 diff_update_probability = 252;
+    s32 frame_sync_code = 0x498342;
+};
+
+} // namespace Decoder
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h
new file mode 100644
index 000000000..4f0b05d22
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -0,0 +1,366 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+
+namespace Decoder {
+struct Vp9FrameDimensions {
+    s16 width{};
+    s16 height{};
+    s16 luma_pitch{};
+    s16 chroma_pitch{};
+};
+static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9FrameDimensions is an invalid size");
+
+enum FrameFlags : u32 {
+    IsKeyFrame = 1 << 0,
+    LastFrameIsKeyFrame = 1 << 1,
+    FrameSizeChanged = 1 << 2,
+    ErrorResilientMode = 1 << 3,
+    LastShowFrame = 1 << 4,
+    IntraOnly = 1 << 5,
+};
+
+enum class MvJointType {
+    MvJointZero = 0,   /* Zero vector */
+    MvJointHnzvz = 1,  /* Vert zero, hor nonzero */
+    MvJointHzvnz = 2,  /* Hor zero, vert nonzero */
+    MvJointHnzvnz = 3, /* Both components nonzero */
+};
+enum class MvClassType {
+    MvClass0 = 0,   /* (0, 2] integer pel */
+    MvClass1 = 1,   /* (2, 4] integer pel */
+    MvClass2 = 2,   /* (4, 8] integer pel */
+    MvClass3 = 3,   /* (8, 16] integer pel */
+    MvClass4 = 4,   /* (16, 32] integer pel */
+    MvClass5 = 5,   /* (32, 64] integer pel */
+    MvClass6 = 6,   /* (64, 128] integer pel */
+    MvClass7 = 7,   /* (128, 256] integer pel */
+    MvClass8 = 8,   /* (256, 512] integer pel */
+    MvClass9 = 9,   /* (512, 1024] integer pel */
+    MvClass10 = 10, /* (1024, 2048] integer pel */
+};
+
+enum class BlockSize {
+    Block4x4 = 0,
+    Block4x8 = 1,
+    Block8x4 = 2,
+    Block8x8 = 3,
+    Block8x16 = 4,
+    Block16x8 = 5,
+    Block16x16 = 6,
+    Block16x32 = 7,
+    Block32x16 = 8,
+    Block32x32 = 9,
+    Block32x64 = 10,
+    Block64x32 = 11,
+    Block64x64 = 12,
+    BlockSizes = 13,
+    BlockInvalid = BlockSizes
+};
+
+enum class PredictionMode {
+    DcPred = 0,   // Average of above and left pixels
+    VPred = 1,    // Vertical
+    HPred = 2,    // Horizontal
+    D45Pred = 3,  // Directional 45 deg = round(arctan(1 / 1) * 180 / pi)
+    D135Pred = 4, // Directional 135 deg = 180 - 45
+    D117Pred = 5, // Directional 117 deg = 180 - 63
+    D153Pred = 6, // Directional 153 deg = 180 - 27
+    D207Pred = 7, // Directional 207 deg = 180 + 27
+    D63Pred = 8,  // Directional 63 deg = round(arctan(2 / 1) * 180 / pi)
+    TmPred = 9,   // True-motion
+    NearestMv = 10,
+    NearMv = 11,
+    ZeroMv = 12,
+    NewMv = 13,
+    MbModeCount = 14
+};
+
+enum class TxSize {
+    Tx4x4 = 0,   // 4x4 transform
+    Tx8x8 = 1,   // 8x8 transform
+    Tx16x16 = 2, // 16x16 transform
+    Tx32x32 = 3, // 32x32 transform
+    TxSizes = 4
+};
+
+enum class TxMode {
+    Only4X4 = 0,      // Only 4x4 transform used
+    Allow8X8 = 1,     // Allow block transform size up to 8x8
+    Allow16X16 = 2,   // Allow block transform size up to 16x16
+    Allow32X32 = 3,   // Allow block transform size up to 32x32
+    TxModeSelect = 4, // Transform specified for each block
+    TxModes = 5
+};
+
+enum class reference_mode {
+    SingleReference = 0,
+    CompoundReference = 1,
+    ReferenceModeSelect = 2,
+    ReferenceModes = 3
+};
+
+struct Segmentation {
+    u8 enabled{};
+    u8 update_map{};
+    u8 temporal_update{};
+    u8 abs_delta{};
+    std::array<u32, 8> feature_mask{};
+    std::array<std::array<s16, 4>, 8> feature_data{};
+};
+static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
+
+struct LoopFilter {
+    u8 mode_ref_delta_enabled{};
+    std::array<s8, 4> ref_deltas{};
+    std::array<s8, 2> mode_deltas{};
+};
+static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
+
+struct Vp9EntropyProbs {
+    std::array<u8, 36> y_mode_prob{};
+    std::array<u8, 64> partition_prob{};
+    std::array<u8, 2304> coef_probs{};
+    std::array<u8, 8> switchable_interp_prob{};
+    std::array<u8, 28> inter_mode_prob{};
+    std::array<u8,
4> intra_inter_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 3> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 20> classes{}; + std::array<u8, 2> class_0{}; + std::array<u8, 20> prob_bits{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + bool is_key_frame{}; + bool intra_only{}; + bool last_frame_was_key{}; + bool frame_size_changed{}; + bool error_resilient_mode{}; + bool last_frame_shown{}; + bool show_frame{}; + std::array<s8, 4> ref_frame_sign_bias{}; + s32 base_q_index{}; + s32 y_dc_delta_q{}; + s32 uv_dc_delta_q{}; + s32 uv_ac_delta_q{}; + bool lossless{}; + s32 transform_mode{}; + bool allow_high_precision_mv{}; + s32 interp_filter{}; + s32 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + s32 log2_tile_cols{}; + s32 log2_tile_rows{}; + bool segment_enabled{}; + bool segment_map_update{}; + bool segment_map_temporal_update{}; + s32 segment_abs_delta{}; + std::array<u32, 8> segment_feature_enable{}; + std::array<std::array<s16, 4>, 8> segment_feature_data{}; + bool mode_ref_delta_enabled{}; + bool use_prev_in_find_mv_refs{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; + Vp9EntropyProbs entropy{}; + Vp9FrameDimensions frame_size{}; + u8 first_level{}; + u8 sharpness_level{}; + u32 bitstream_size{}; + std::array<u64, 4> frame_offsets{}; + std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector<u8> bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS(12); + u32 bitstream_size{}; + INSERT_PADDING_WORDS(5); + Vp9FrameDimensions last_frame_size{}; + Vp9FrameDimensions golden_frame_size{}; + Vp9FrameDimensions alt_frame_size{}; + Vp9FrameDimensions current_frame_size{}; + u32 vp9_flags{}; + std::array<s8, 4> ref_frame_sign_bias{}; + u8 first_level{}; + u8 sharpness_level{}; + u8 base_q_index{}; + u8 y_dc_delta_q{}; + u8 uv_ac_delta_q{}; + u8 uv_dc_delta_q{}; + u8 lossless{}; + u8 tx_mode{}; + u8 allow_high_precision_mv{}; + u8 interp_filter{}; + u8 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + u8 log2_tile_cols{}; + u8 log2_tile_rows{}; + Segmentation segmentation{}; + LoopFilter loop_filter{}; + INSERT_PADDING_BYTES(5); + u32 surface_params{}; + INSERT_PADDING_WORDS(3); + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, + .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, + .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, + .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, + .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, + .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .lossless = lossless != 0, + .transform_mode = tx_mode, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + 
.comp_fixed_ref = comp_fixed_ref,
+            .comp_var_ref = comp_var_ref,
+            .log2_tile_cols = log2_tile_cols,
+            .log2_tile_rows = log2_tile_rows,
+            .segment_enabled = segmentation.enabled != 0,
+            .segment_map_update = segmentation.update_map != 0,
+            .segment_map_temporal_update = segmentation.temporal_update != 0,
+            .segment_abs_delta = segmentation.abs_delta,
+            .segment_feature_enable = segmentation.feature_mask,
+            .segment_feature_data = segmentation.feature_data,
+            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
+            .use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
+                                        (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
+                                        (vp9_flags & FrameFlags::IntraOnly) == 0 &&
+                                        (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
+                                        (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
+            .ref_deltas = loop_filter.ref_deltas,
+            .mode_deltas = loop_filter.mode_deltas,
+            .frame_size = current_frame_size,
+            .first_level = first_level,
+            .sharpness_level = sharpness_level,
+            .bitstream_size = bitstream_size,
+        };
+    }
+};
+static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
+
+struct EntropyProbs {
+    INSERT_PADDING_BYTES(1024);
+    std::array<std::array<u8, 4>, 7> inter_mode_prob{};
+    std::array<u8, 4> intra_inter_prob{};
+    INSERT_PADDING_BYTES(80);
+    std::array<std::array<u8, 1>, 2> tx_8x8_prob{};
+    std::array<std::array<u8, 2>, 2> tx_16x16_prob{};
+    std::array<std::array<u8, 3>, 2> tx_32x32_prob{};
+    std::array<u8, 4> y_mode_prob_e8{};
+    std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
+    INSERT_PADDING_BYTES(64);
+    std::array<std::array<u8, 4>, 16> partition_prob{};
+    INSERT_PADDING_BYTES(10);
+    std::array<std::array<u8, 2>, 4> switchable_interp_prob{};
+    std::array<u8, 5> comp_inter_prob{};
+    std::array<u8, 4> skip_probs{};
+    std::array<u8, 3> joints{};
+    std::array<u8, 2> sign{};
+    std::array<std::array<u8, 1>, 2> class_0{};
+    std::array<std::array<u8, 3>, 2> fr{};
+    std::array<u8, 2> class_0_hp{};
+    std::array<u8, 2> high_precision{};
+    std::array<std::array<u8, 10>, 2> classes{};
+    std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{};
+    std::array<std::array<u8, 10>, 2> pred_bits{};
+    std::array<std::array<u8, 2>, 5> single_ref_prob{};
+    std::array<u8, 5> comp_ref_prob{};
+    INSERT_PADDING_BYTES(17);
+    std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4>
+        coef_probs{};
+
+    void Convert(Vp9EntropyProbs& fc) {
+        std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size());
+
+        std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(),
+                    fc.intra_inter_prob.size());
+
+        std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size());
+        std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size());
+        std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size());
+
+        for (s32 i = 0; i < 4; i++) {
+            for (s32 j = 0; j < 9; j++) {
+                fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
+            }
+        }
+
+        std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size());
+
+        std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(),
+                    fc.switchable_interp_prob.size());
+        std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size());
+        std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size());
+
+        std::memcpy(fc.joints.data(), joints.data(), fc.joints.size());
+
+        std::memcpy(fc.sign.data(), sign.data(), fc.sign.size());
+        std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size());
+        std::memcpy(fc.fr.data(), fr.data(), fc.fr.size());
+        std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size());
+        std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size());
+        std::memcpy(fc.classes.data(), classes.data(), fc.classes.size());
+        std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size());
+        std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size());
+        std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size());
+        std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size());
+
+        std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size());
+    }
+};
+static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
+
+enum class Ref { Last, Golden, AltRef };
+
+struct RefPoolElement {
+    s64 frame{};
+    Ref ref{};
+    bool refresh{};
+};
+
+struct FrameContexts {
+    s64 from{};
+    bool adapted{};
+    Vp9EntropyProbs probs{};
+};
+
+} // namespace Decoder
+} // namespace Tegra
diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp
new file mode 100644
index 000000000..c4dd4881a
--- /dev/null
+++ b/src/video_core/command_classes/host1x.cpp
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include "common/assert.h"
+#include "video_core/command_classes/host1x.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+
+Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
+
+Host1x::~Host1x() = default;
+
+void Host1x::StateWrite(u32 offset, u32 arguments) {
+    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32);
+    std::memcpy(state_offset, &arguments, sizeof(u32));
+}
+
+void Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    StateWrite(static_cast<u32>(method), arguments[0]);
+    switch (method) {
+    case Method::WaitSyncpt:
+        Execute(arguments[0]);
+        break;
+    case Method::LoadSyncptPayload32:
+        syncpoint_value = arguments[0];
+        break;
+    case Method::WaitSyncpt32:
+        Execute(arguments[0]);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
+        break;
+    }
+}
+
+void Host1x::Execute(u32 data) {
+    // This method waits on a valid syncpoint.
+    // TODO: Implement when proper Async is in place
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h
new file mode 100644
index 000000000..013eaa0c1
--- /dev/null
+++ b/src/video_core/command_classes/host1x.h
@@ -0,0 +1,80 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+class Host1x {
+public:
+    struct Host1xClassRegisters {
+        u32 incr_syncpt{};
+        u32 incr_syncpt_ctrl{};
+        u32 incr_syncpt_error{};
+        INSERT_PADDING_WORDS(5);
+        u32 wait_syncpt{};
+        u32 wait_syncpt_base{};
+        u32 wait_syncpt_incr{};
+        u32 load_syncpt_base{};
+        u32 incr_syncpt_base{};
+        u32 clear{};
+        u32 wait{};
+        u32 wait_with_interrupt{};
+        u32 delay_use{};
+        u32 tick_count_high{};
+        u32 tick_count_low{};
+        u32 tick_ctrl{};
+        INSERT_PADDING_WORDS(23);
+        u32 ind_ctrl{};
+        u32 ind_off2{};
+        u32 ind_off{};
+        std::array<u32, 31> ind_data{};
+        INSERT_PADDING_WORDS(1);
+        u32 load_syncpoint_payload32{};
+        u32 stall_ctrl{};
+        u32 wait_syncpt32{};
+        u32 wait_syncpt_base32{};
+        u32 load_syncpt_base32{};
+        u32 incr_syncpt_base32{};
+        u32 stall_count_high{};
+        u32 stall_count_low{};
+        u32 xref_ctrl{};
+        u32 channel_xref_high{};
+        u32 channel_xref_low{};
+    };
+    static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
+
+    enum class Method : u32 {
+        WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
+        LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
+        WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
+    };
+
+    explicit Host1x(GPU& gpu);
+    ~Host1x();
+
+    /// Writes the method into the state; invokes Execute() when encountered
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+private:
+    /// For Host1x, Execute() waits on a syncpoint previously written into the state
+    void Execute(u32 data);
+
+    /// Writes the argument into the provided offset
+    void StateWrite(u32 offset, u32 arguments);
+
+    u32 syncpoint_value{};
+    Host1xClassRegisters state{};
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp
new file mode 100644
index 000000000..8ca7a7b06
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -0,0 +1,52 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+
+Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
+
+Nvdec::~Nvdec() = default;
+
+void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    if (method == Method::SetVideoCodec) {
+        codec->StateWrite(static_cast<u32>(method), arguments[0]);
+    } else {
+        codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
+    }
+
+    switch (method) {
+    case Method::SetVideoCodec:
+        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
+        break;
+    case Method::Execute:
+        Execute();
+        break;
+    }
+}
+
+AVFrame* Nvdec::GetFrame() {
+    return codec->GetCurrentFrame();
+}
+
+const AVFrame* Nvdec::GetFrame() const {
+    return codec->GetCurrentFrame();
+}
+
+void Nvdec::Execute() {
+    switch (codec->GetCurrentCodec()) {
+    case NvdecCommon::VideoCodec::H264:
+    case NvdecCommon::VideoCodec::Vp9:
+        codec->Decode();
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
+        break;
+    }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h
new file mode 100644
index 000000000..eec4443f9
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/command_classes/codecs/codec.h"
+
+namespace Tegra {
+class GPU;
+
+class Nvdec {
+public:
+    enum class Method : u32 {
+        SetVideoCodec = 0x80,
+        Execute = 0xc0,
+    };
+
+    explicit Nvdec(GPU& gpu);
+    ~Nvdec();
+
+    /// Writes the method into the state; invokes Execute() when encountered
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+    /// Returns the most recently decoded frame
+    [[nodiscard]] AVFrame* GetFrame();
+    [[nodiscard]] const AVFrame* GetFrame() const;
+
+private:
+    /// Invokes the codec to decode a frame
+    void Execute();
+
+    GPU& gpu;
+    std::unique_ptr<Codec> codec;
+};
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h
new file mode 100644
index 000000000..01b5e086d
--- /dev/null
+++ b/src/video_core/command_classes/nvdec_common.h
@@ -0,0 +1,49 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::NvdecCommon {
+
+struct NvdecRegisters {
+    INSERT_PADDING_WORDS(256);
+    u64 set_codec_id{};
+    INSERT_PADDING_WORDS(254);
+    u64 set_platform_id{};
+    u64 picture_info_offset{};
+    u64 frame_bitstream_offset{};
+    u64 frame_number{};
+    u64 h264_slice_data_offsets{};
+    u64 h264_mv_dump_offset{};
+    INSERT_PADDING_WORDS(6);
+    u64 frame_stats_offset{};
+    u64 h264_last_surface_luma_offset{};
+    u64 h264_last_surface_chroma_offset{};
+    std::array<u64, 17> surface_luma_offset{};
+    std::array<u64, 17> surface_chroma_offset{};
+    INSERT_PADDING_WORDS(132);
+    u64 vp9_entropy_probs_offset{};
+    u64 vp9_backward_updates_offset{};
+    u64 vp9_last_frame_segmap_offset{};
+    u64 vp9_curr_frame_segmap_offset{};
+    INSERT_PADDING_WORDS(2);
+    u64 vp9_last_frame_mvs_offset{};
+    u64 vp9_curr_frame_mvs_offset{};
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is an invalid size");
+
+enum class VideoCodec : u32 {
+    None = 0x0,
+    H264 = 0x3,
+    Vp8 = 0x5,
+    H265 = 0x7,
+    Vp9 = 0x9,
+};
+
+} // namespace Tegra::NvdecCommon
diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp
new file mode 100644
index 000000000..19dc9e0ab
--- /dev/null
+++ b/src/video_core/command_classes/sync_manager.cpp
@@ -0,0 +1,60 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include <algorithm>
+#include "video_core/command_classes/sync_manager.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
+SyncptIncrManager::~SyncptIncrManager() = default;
+
+void SyncptIncrManager::Increment(u32 id) {
+    increments.emplace_back(0, 0, id, true);
+    IncrementAllDone();
+}
+
+u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
+    const u32 handle = current_id++;
+    increments.emplace_back(handle, class_id, id);
+    return handle;
+}
+
+void SyncptIncrManager::SignalDone(u32 handle) {
+    const auto done_incr =
+        std::find_if(increments.begin(), increments.end(),
+                     [handle](const SyncptIncr& incr) { return incr.id == handle; });
+    if (done_incr != increments.cend()) {
+        done_incr->complete = true;
+    }
+    IncrementAllDone();
+}
+
+void SyncptIncrManager::IncrementAllDone() {
+    std::size_t done_count = 0;
+    for (; done_count < increments.size(); ++done_count) {
+        if (!increments[done_count].complete) {
+            break;
+        }
+        gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
+    }
+    increments.erase(increments.begin(), increments.begin() + done_count);
+}
+} // namespace Tegra
diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h
new file mode 100644
index 000000000..2c321ec58
--- /dev/null
+++ b/src/video_core/command_classes/sync_manager.h
@@ -0,0 +1,64 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+struct SyncptIncr {
+    u32 id;
+    u32 class_id;
+    u32 syncpt_id;
+    bool complete;
+
+    SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
+        : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
+};
+
+class SyncptIncrManager {
+public:
+    explicit SyncptIncrManager(GPU& gpu);
+    ~SyncptIncrManager();
+
+    /// Adds a syncpoint increment for the given id and processes all completed increments
+    void Increment(u32 id);
+
+    /// Returns a handle to increment later
+    u32 IncrementWhenDone(u32 class_id, u32 id);
+
+    /// Marks the increment for the given handle as complete and invokes IncrementAllDone
+    void SignalDone(u32 handle);
+
+    /// Increments all sequential pending increments that are already done.
+    void IncrementAllDone();
+
+private:
+    std::vector<SyncptIncr> increments;
+    std::mutex increment_lock;
+    u32 current_id{};
+
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
new file mode 100644
index 000000000..5b52da277
--- /dev/null
+++ b/src/video_core/command_classes/vic.cpp
@@ -0,0 +1,183 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <cstring>
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/command_classes/vic.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/texture_cache/surface_params.h"
+
+extern "C" {
+#include <libswscale/swscale.h>
+}
+
+namespace Tegra {
+
+Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
+    : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
+Vic::~Vic() = default;
+
+void Vic::VicStateWrite(u32 offset, u32 arguments) {
+    u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
+    std::memcpy(state_offset, &arguments, sizeof(u32));
+}
+
+void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
+    VicStateWrite(static_cast<u32>(method), arguments[0]);
+    const u64 arg = static_cast<u64>(arguments[0]) << 8;
+    switch (method) {
+    case Method::Execute:
+        Execute();
+        break;
+    case Method::SetConfigStructOffset:
+        config_struct_address = arg;
+        break;
+    case Method::SetOutputSurfaceLumaOffset:
+        output_surface_luma_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaUOffset:
+        output_surface_chroma_u_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaVOffset:
+        output_surface_chroma_v_address = arg;
+        break;
+    default:
+        break;
+    }
+}
+
+void Vic::Execute() {
+    if (output_surface_luma_address == 0) {
+        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}",
+                  vic_state.output_surface.luma_offset);
+        return;
+    }
+    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
+    const VideoPixelFormat pixel_format =
+        static_cast<VideoPixelFormat>(config.pixel_format.Value());
+    switch (pixel_format) {
+    case VideoPixelFormat::BGRA8:
+    case VideoPixelFormat::RGBA8: {
+        LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+        const auto* frame = nvdec_processor->GetFrame();
+
+        if (!frame || frame->width == 0 || frame->height == 0) {
+            return;
+        }
+        if (scaler_ctx == nullptr || frame->width != scaler_width ||
+            frame->height != scaler_height) {
+            const AVPixelFormat target_format =
+                (pixel_format == VideoPixelFormat::RGBA8) ?
AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; + + sws_freeContext(scaler_ctx); + scaler_ctx = nullptr; + + // FFmpeg returns all frames in YUV420, convert it into expected format + scaler_ctx = + sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + + scaler_width = frame->width; + scaler_height = frame->height; + } + // Get Converted frame + const std::size_t linear_size = frame->width * frame->height * 4; + + using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; + AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; + + const int converted_stride{frame->width * 4}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, + &converted_frame_buf_addr, &converted_stride); + + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, + block_height, 0); + std::vector<u8> swizzled_data(size); + Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4, + swizzled_data.data(), converted_frame_buffer.get(), + false, block_height, 0, 1); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); + gpu.Maxwell3D().OnMemoryWrite(); + } else { + // send pitch linear frame + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + gpu.Maxwell3D().OnMemoryWrite(); + } + break; + } + case VideoPixelFormat::Yuv420: { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const auto* frame = nvdec_processor->GetFrame(); + + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t half_width = surface_width / 2; + const std::size_t half_height = config.surface_height_minus1 / 2; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + + const auto* luma_ptr = frame->data[0]; + const auto* chroma_b_ptr = frame->data[1]; + const auto* chroma_r_ptr = frame->data[2]; + const auto stride = frame->linesize[0]; + const auto half_stride = frame->linesize[1]; + + std::vector<u8> luma_buffer(aligned_width * surface_height); + std::vector<u8> chroma_buffer(aligned_width * half_height); + + // Populate luma buffer + for (std::size_t y = 0; y < surface_height - 1; ++y) { + std::size_t src = y * stride; + std::size_t dst = y * aligned_width; + + std::size_t size = surface_width; + + for (std::size_t offset = 0; offset < size; ++offset) { + luma_buffer[dst + offset] = luma_ptr[src + offset]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Populate chroma buffer from both channels with interleaving. 
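+        // (The interleaving below yields an NV12-style chroma plane: alternating U and V
+        // bytes, converted from FFmpeg's planar YUV420P output.)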
+        for (std::size_t y = 0; y < half_height; ++y) {
+            std::size_t src = y * half_stride;
+            std::size_t dst = y * aligned_width;
+
+            for (std::size_t x = 0; x < half_width; ++x) {
+                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
+                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+                                       chroma_buffer.size());
+        gpu.Maxwell3D().OnMemoryWrite();
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+        break;
+    }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
new file mode 100644
index 000000000..8c4e284a1
--- /dev/null
+++ b/src/video_core/command_classes/vic.h
@@ -0,0 +1,112 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+struct SwsContext;
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+struct PlaneOffsets {
+    u32 luma_offset{};
+    u32 chroma_u_offset{};
+    u32 chroma_v_offset{};
+};
+
+struct VicRegisters {
+    INSERT_PADDING_WORDS(64);
+    u32 nop{};
+    INSERT_PADDING_WORDS(15);
+    u32 pm_trigger{};
+    INSERT_PADDING_WORDS(47);
+    u32 set_application_id{};
+    u32 set_watchdog_timer{};
+    INSERT_PADDING_WORDS(17);
+    u32 context_save_area{};
+    u32 context_switch{};
+    INSERT_PADDING_WORDS(43);
+    u32 execute{};
+    INSERT_PADDING_WORDS(63);
+    std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
+    u32 picture_index{};
+    u32 control_params{};
+    u32 config_struct_offset{};
+    u32 filter_struct_offset{};
+    u32 palette_offset{};
+    u32 hist_offset{};
+    u32 context_id{};
+    u32 fce_ucode_size{};
+    PlaneOffsets output_surface{};
+    u32 fce_ucode_offset{};
+    INSERT_PADDING_WORDS(4);
+    std::array<u32, 8> slot_context_id{};
+    INSERT_PADDING_WORDS(16);
+};
+static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
+
+class Vic {
+public:
+    enum class Method : u32 {
+        Execute = 0xc0,
+        SetControlParams = 0x1c1,
+        SetConfigStructOffset = 0x1c2,
+        SetOutputSurfaceLumaOffset = 0x1c8,
+        SetOutputSurfaceChromaUOffset = 0x1c9,
+        SetOutputSurfaceChromaVOffset = 0x1ca
+    };
+
+    explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
+    ~Vic();
+
+    /// Writes the method into the device state; invokes Execute() when encountered
+ void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + void Execute(); + + void VicStateWrite(u32 offset, u32 arguments); + VicRegisters vic_state{}; + + enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + Yuv420 = 0x44, + }; + + union VicConfig { + u64_le raw{}; + BitField<0, 7, u64_le> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<19, 3, u64_le> reserved0; + BitField<22, 10, u64_le> reserved1; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; + }; + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_u_address{}; + GPUVAddr output_surface_chroma_v_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index f2f96ac33..d8801b1f5 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -45,32 +46,41 @@ bool DmaPusher::Step() { return false; } - const CommandList& command_list{dma_pushbuffer.front()}; - ASSERT_OR_EXECUTE(!command_list.empty(), { - // Somehow the command_list is empty, in order to avoid a crash - // We ignore it and assume its size is 0. - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - return true; - }); - const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; - const GPUVAddr dma_get = command_list_header.addr; - - if (dma_pushbuffer_subindex >= command_list.size()) { - // We've gone through the current list, remove it from the queue - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - } + CommandList& command_list{dma_pushbuffer.front()}; - if (command_list_header.size == 0) { - return true; - } + ASSERT_OR_EXECUTE( + command_list.command_lists.size() || command_list.prefetch_command_list.size(), { + // Somehow the command_list is empty, in order to avoid a crash + // We ignore it and assume its size is 0. 
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+            return true;
+        });
+
+    if (command_list.prefetch_command_list.size()) {
+        // Prefetched command list from nvdrv, used for things like synchronization
+        command_headers = std::move(command_list.prefetch_command_list);
+        dma_pushbuffer.pop();
+    } else {
+        const CommandListHeader command_list_header{
+            command_list.command_lists[dma_pushbuffer_subindex++]};
+        const GPUVAddr dma_get = command_list_header.addr;
+
+        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
+            // We've gone through the current list; remove it from the queue
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+        }
-    // Push buffer non-empty, read a word
-    command_headers.resize(command_list_header.size);
-    gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
-                                        command_list_header.size * sizeof(u32));
+        if (command_list_header.size == 0) {
+            return true;
+        }
+        // Push buffer non-empty, read a word
+        command_headers.resize(command_list_header.size);
+        gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+                                            command_list_header.size * sizeof(u32));
+    }
     for (std::size_t index = 0; index < command_headers.size();) {
         const CommandHeader& command_header = command_headers[index];
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index efa90d170..96ac267f7 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -18,6 +18,8 @@ class System;
 
 namespace Tegra {
 
+class GPU;
+
 enum class SubmissionMode : u32 {
     IncreasingOld = 0,
     Increasing = 1,
@@ -27,6 +29,31 @@ enum class SubmissionMode : u32 {
     IncreaseOnce = 5
};
 
+// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
+// their numbers appear multiplied by 4 in documentation. Here we do not multiply by 4,
+// so a value in the docs is four times the value used here.
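+// For example, SemaphoreTrigger is 0x7 here but appears as 0x1C in such documentation.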
+enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -49,9 +76,23 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); -class GPU; +inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} -using CommandList = std::vector<Tegra::CommandListHeader>; +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<Tegra::CommandHeader>&& prefetch_command_list) + : prefetch_command_list{std::move(prefetch_command_list)} {} + + std::vector<Tegra::CommandListHeader> command_lists; + std::vector<Tegra::CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -60,7 +101,7 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. */ -class DmaPusher { +class DmaPusher final { public: explicit DmaPusher(Core::System& system, GPU& gpu); ~DmaPusher(); diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 6e50661a3..9409c4075 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -87,12 +87,12 @@ void Fermi2D::HandleSurfaceCopy() { const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2}; const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2, dst_blit_y2}; - Config copy_config; - copy_config.operation = regs.operation; - copy_config.filter = regs.blit_control.filter; - copy_config.src_rect = src_rect; - copy_config.dst_rect = dst_rect; - + const Config copy_config{ + .operation = regs.operation, + .filter = regs.blit_control.filter, + .src_rect = src_rect, + .dst_rect = dst_rect, + }; if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { UNIMPLEMENTED(); } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 213abfaae..0909709ec 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -145,8 +145,8 @@ public: } regs{}; struct Config { - Operation operation; - Filter filter; + Operation operation{}; + Filter filter{}; Common::Rectangle<u32> src_rect; Common::Rectangle<u32> dst_rect; }; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 33854445f..6287df633 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -124,6 +124,112 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } +void Maxwell3D::ProcessMacro(u32 method, 
const u32* base_start, u32 amount, bool is_last_call) { + if (executing_macro == 0) { + // A macro call must begin by writing the macro method's register, not its argument. + ASSERT_MSG((method % 2) == 0, + "Can't start macro execution by writing to the ARGS register"); + executing_macro = method; + } + + macro_params.insert(macro_params.end(), base_start, base_start + amount); + + // Call the macro when there are no more parameters in the command buffer + if (is_last_call) { + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); + } +} + +u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { + // Keep track of the register value in shadow_state when requested. + const auto control = shadow_state.shadow_ram_control; + if (control == Regs::ShadowRamControl::Track || + control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = argument; + return argument; + } + if (control == Regs::ShadowRamControl::Replay) { + return shadow_state.reg_array[method]; + } + return argument; +} + +void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { + if (regs.reg_array[method] == argument) { + return; + } + regs.reg_array[method] = argument; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; + } +} + +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, + bool is_last_call) { + switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): + return rasterizer->WaitForIdle(); + case MAXWELL3D_REG_INDEX(shadow_ram_control): + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument); + return; + case MAXWELL3D_REG_INDEX(macros.data): + return macro_engine->AddCode(regs.macros.upload_address, argument); + case MAXWELL3D_REG_INDEX(macros.bind): + return ProcessMacroBind(argument); + case MAXWELL3D_REG_INDEX(firmware[4]): + return ProcessFirmwareCall4(); + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + return StartCBData(method); + case MAXWELL3D_REG_INDEX(cb_bind[0]): + return ProcessCBBind(0); + case MAXWELL3D_REG_INDEX(cb_bind[1]): + return ProcessCBBind(1); + case MAXWELL3D_REG_INDEX(cb_bind[2]): + return ProcessCBBind(2); + case MAXWELL3D_REG_INDEX(cb_bind[3]): + return ProcessCBBind(3); + case MAXWELL3D_REG_INDEX(cb_bind[4]): + return ProcessCBBind(4); + case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): + return DrawArrays(); + case MAXWELL3D_REG_INDEX(clear_buffers): + return ProcessClearBuffers(); + case MAXWELL3D_REG_INDEX(query.query_get): + return ProcessQueryGet(); + case MAXWELL3D_REG_INDEX(condition.mode): + return ProcessQueryCondition(); + case MAXWELL3D_REG_INDEX(counter_reset): + return ProcessCounterReset(); + case MAXWELL3D_REG_INDEX(sync_info): + 
return ProcessSyncPoint(); + case MAXWELL3D_REG_INDEX(exec_upload): + return upload_state.ProcessExec(regs.exec_upload.linear != 0); + case MAXWELL3D_REG_INDEX(data_upload): + upload_state.ProcessData(argument, is_last_call); + if (is_last_call) { + OnMemoryWrite(); + } + return; + } +} + void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -157,142 +263,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - macro_params.push_back(method_argument); - - // Call the macro when there are no more parameters in the command buffer - if (is_last_call) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, &method_argument, 1, is_last_call); return; } ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - u32 arg = method_argument; - // Keep track of the register value in shadow_state when requested. - if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || - shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { - shadow_state.reg_array[method] = arg; - } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { - arg = shadow_state.reg_array[method]; - } - - if (regs.reg_array[method] != arg) { - regs.reg_array[method] = arg; - - for (const auto& table : dirty.tables) { - dirty.flags[table[method]] = true; - } - } - - switch (method) { - case MAXWELL3D_REG_INDEX(wait_for_idle): { - rasterizer->WaitForIdle(); - break; - } - case MAXWELL3D_REG_INDEX(shadow_ram_control): { - shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument); - break; - } - case MAXWELL3D_REG_INDEX(macros.data): { - macro_engine->AddCode(regs.macros.upload_address, arg); - break; - } - case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(arg); - break; - } - case MAXWELL3D_REG_INDEX(firmware[4]): { - ProcessFirmwareCall4(); - break; - } - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - StartCBData(method); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[0]): { - ProcessCBBind(0); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[1]): { - ProcessCBBind(1); - break; - } - case 
MAXWELL3D_REG_INDEX(cb_bind[2]): { - ProcessCBBind(2); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[3]): { - ProcessCBBind(3); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[4]): { - ProcessCBBind(4); - break; - } - case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): { - DrawArrays(); - break; - } - case MAXWELL3D_REG_INDEX(clear_buffers): { - ProcessClearBuffers(); - break; - } - case MAXWELL3D_REG_INDEX(query.query_get): { - ProcessQueryGet(); - break; - } - case MAXWELL3D_REG_INDEX(condition.mode): { - ProcessQueryCondition(); - break; - } - case MAXWELL3D_REG_INDEX(counter_reset): { - ProcessCounterReset(); - break; - } - case MAXWELL3D_REG_INDEX(sync_info): { - ProcessSyncPoint(); - break; - } - case MAXWELL3D_REG_INDEX(exec_upload): { - upload_state.ProcessExec(regs.exec_upload.linear != 0); - break; - } - case MAXWELL3D_REG_INDEX(data_upload): { - upload_state.ProcessData(arg, is_last_call); - if (is_last_call) { - OnMemoryWrite(); - } - break; - } - default: - break; - } + const u32 argument = ProcessShadowRam(method, method_argument); + ProcessDirtyRegisters(method, argument); + ProcessMethodCall(method, argument, method_argument, is_last_call); } void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, @@ -300,23 +280,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - for (std::size_t i = 0; i < amount; i++) { - macro_params.push_back(base_start[i]); - } - - // Call the macro when there are no more parameters in the command buffer - if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, base_start, amount, amount == methods_pending); return; } switch (method) { @@ -335,15 +299,14 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): ProcessCBMultiData(method, base_start, amount); break; - } - default: { + default: for (std::size_t i = 0; i < amount; i++) { CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); } - } + break; } } @@ -597,7 +560,7 @@ std::optional<u64> Maxwell3D::GetQueryResult() { // Deferred. 
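        // The rasterizer will write the result to guest memory on its own, so
        // there is no immediate value to return.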
rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, system.GPU().GetTicks()); - return {}; + return std::nullopt; default: LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", static_cast<u32>(regs.query.query_get.select.Value())); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index bc289c55d..1cbe8fe67 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1461,6 +1461,14 @@ public: private: void InitializeRegisterDefaults(); + void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call); + + u32 ProcessShadowRam(u32 method, u32 argument); + + void ProcessDirtyRegisters(u32 method, u32 argument); + + void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call); + Core::System& system; MemoryManager& memory_manager; diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index e88290754..8fa359d0a 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -114,8 +114,6 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const u32 block_depth = src_params.block_size.depth; const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t src_layer_size = - CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); if (read_buffer.size() < src_size) { read_buffer.resize(src_size); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index d374b73cf..37d17efdc 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -32,31 +32,31 @@ struct Register { constexpr Register() = default; - constexpr Register(u64 value) : value(value) {} + constexpr Register(u64 value_) : value(value_) {} - constexpr operator u64() const { + [[nodiscard]] constexpr operator u64() const { return value; } template <typename T> - constexpr u64 operator-(const T& oth) const { + [[nodiscard]] constexpr u64 operator-(const T& oth) const { return value - oth; } template <typename T> - constexpr u64 operator&(const T& oth) const { + [[nodiscard]] constexpr u64 operator&(const T& oth) const { return value & oth; } - constexpr u64 operator&(const Register& oth) const { + [[nodiscard]] constexpr u64 operator&(const Register& oth) const { return value & oth.value; } - constexpr u64 operator~() const { + [[nodiscard]] constexpr u64 operator~() const { return ~value; } - u64 GetSwizzledIndex(u64 elem) const { + [[nodiscard]] u64 GetSwizzledIndex(u64 elem) const { elem = (value + elem) & 3; return (value & ~3) + elem; } @@ -75,7 +75,7 @@ enum class AttributeSize : u64 { union Attribute { Attribute() = default; - constexpr explicit Attribute(u64 value) : value(value) {} + constexpr explicit Attribute(u64 value_) : value(value_) {} enum class Index : u64 { LayerViewportPointSize = 6, @@ -107,7 +107,7 @@ union Attribute { BitField<31, 1, u64> patch; BitField<47, 3, AttributeSize> size; - bool IsPhysical() const { + [[nodiscard]] bool IsPhysical() const { return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0; } } fmt20; @@ -124,7 +124,7 @@ union Attribute { union Sampler { Sampler() = default; - constexpr explicit Sampler(u64 value) : value(value) {} + constexpr explicit Sampler(u64 value_) : value(value_) {} enum class Index : u64 { Sampler_0 = 8, @@ -137,7 +137,7 @@ union Sampler { union Image { Image() = 
default; - constexpr explicit Image(u64 value) : value{value} {} + constexpr explicit Image(u64 value_) : value{value_} {} BitField<36, 13, u64> index; u64 value; @@ -505,14 +505,14 @@ struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; - bool operator==(const IpaMode& a) const { + [[nodiscard]] bool operator==(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) == std::tie(a.interpolation_mode, a.sampling_mode); } - bool operator!=(const IpaMode& a) const { + [[nodiscard]] bool operator!=(const IpaMode& a) const { return !operator==(a); } - bool operator<(const IpaMode& a) const { + [[nodiscard]] bool operator<(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) < std::tie(a.interpolation_mode, a.sampling_mode); } @@ -658,10 +658,10 @@ union Instruction { return *this; } - constexpr Instruction(u64 value) : value{value} {} + constexpr Instruction(u64 value_) : value{value_} {} constexpr Instruction(const Instruction& instr) : value(instr.value) {} - constexpr bool Bit(u64 offset) const { + [[nodiscard]] constexpr bool Bit(u64 offset) const { return ((value >> offset) & 1) != 0; } @@ -746,34 +746,34 @@ union Instruction { BitField<28, 8, u64> imm_lut28; BitField<48, 8, u64> imm_lut48; - u32 GetImmLut28() const { + [[nodiscard]] u32 GetImmLut28() const { return static_cast<u32>(imm_lut28); } - u32 GetImmLut48() const { + [[nodiscard]] u32 GetImmLut48() const { return static_cast<u32>(imm_lut48); } } lop3; - u16 GetImm20_16() const { + [[nodiscard]] u16 GetImm20_16() const { return static_cast<u16>(imm20_16); } - u32 GetImm20_19() const { + [[nodiscard]] u32 GetImm20_19() const { u32 imm{static_cast<u32>(imm20_19)}; imm <<= 12; imm |= negate_imm ? 0x80000000 : 0; return imm; } - u32 GetImm20_32() const { + [[nodiscard]] u32 GetImm20_32() const { return static_cast<u32>(imm20_32); } - s32 GetSignedImm20_20() const { - u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); + [[nodiscard]] s32 GetSignedImm20_20() const { + const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); // Sign extend the 20-bit value. - u32 mask = 1U << (20 - 1); + const auto mask = 1U << (20 - 1); return static_cast<s32>((immediate ^ mask) - mask); } } alu; @@ -857,7 +857,7 @@ union Instruction { BitField<56, 1, u64> second_negate; BitField<30, 9, u64> second; - u32 PackImmediates() const { + [[nodiscard]] u32 PackImmediates() const { // Immediates are half floats shifted. 
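            // Shifting by 6 places the stored bits at the exponent/high-mantissa
            // position of each 16-bit lane; the sign and low mantissa bits stay zero.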
constexpr u32 imm_shift = 6; return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); @@ -1033,7 +1033,7 @@ union Instruction { BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; - s32 GetImmediateOffset() const { + [[nodiscard]] s32 GetImmediateOffset() const { return static_cast<s32>(offset << 2); } } atoms; @@ -1215,7 +1215,7 @@ union Instruction { BitField<39, 4, u64> rounding; // H0, H1 extract for F16 missing BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value - F2fRoundingOp GetRoundingMode() const { + [[nodiscard]] F2fRoundingOp GetRoundingMode() const { constexpr u64 rounding_mask = 0x0B; return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); } @@ -1239,15 +1239,15 @@ union Instruction { BitField<54, 1, u64> aoffi_flag; BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1271,15 +1271,15 @@ union Instruction { BitField<36, 1, u64> aoffi_flag; BitField<37, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1299,7 +1299,7 @@ union Instruction { BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NODEP: return nodep_flag != 0; @@ -1309,7 +1309,7 @@ union Instruction { return false; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } } txq; @@ -1321,11 +1321,11 @@ union Instruction { BitField<35, 1, u64> ndv_flag; BitField<49, 1, u64> nodep_flag; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return (ndv_flag != 0); @@ -1347,7 +1347,7 @@ union Instruction { BitField<54, 2, u64> offset_mode; BitField<56, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1373,7 +1373,7 @@ union Instruction { BitField<33, 2, u64> offset_mode; BitField<37, 2, u64> component; - bool 
UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1399,7 +1399,7 @@ union Instruction { BitField<52, 2, u64> component; BitField<55, 1, u64> fp16_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1422,16 +1422,20 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. - if (texture_info == 0) + if (texture_info == 0) { return TextureType::Texture1D; - if (texture_info >= 1 && texture_info <= 9) + } + if (texture_info >= 1 && texture_info <= 9) { return TextureType::Texture2D; - if (texture_info >= 10 && texture_info <= 11) + } + if (texture_info >= 10 && texture_info <= 11) { return TextureType::Texture3D; - if (texture_info >= 12 && texture_info <= 13) + } + if (texture_info >= 12 && texture_info <= 13) { return TextureType::TextureCube; + } LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", static_cast<u32>(texture_info.Value())); @@ -1439,7 +1443,7 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { switch (texture_info) { case 0: case 2: @@ -1458,7 +1462,7 @@ union Instruction { return TextureProcessMode::None; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; @@ -1470,16 +1474,16 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info >= 7 && texture_info <= 9; } - bool HasTwoDestinations() const { + [[nodiscard]] bool HasTwoDestinations() const { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -1506,7 +1510,7 @@ union Instruction { BitField<54, 1, u64> cl; BitField<55, 1, u64> process_mode; - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL; } } tld; @@ -1516,7 +1520,7 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. 
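            // texture_info also encodes the process mode and misc flags (see below).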
if (texture_info <= 1) { return TextureType::Texture1D; @@ -1535,13 +1539,14 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { - if (texture_info == 1 || texture_info == 5 || texture_info == 12) + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) { return TextureProcessMode::LL; + } return TextureProcessMode::LZ; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return texture_info == 12 || texture_info == 4; @@ -1555,7 +1560,7 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info == 8; } @@ -1567,7 +1572,7 @@ union Instruction { BitField<35, 1, u64> aoffi_flag; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return aoffi_flag != 0; @@ -1591,7 +1596,7 @@ union Instruction { BitField<20, 3, StoreType> store_data_layout; BitField<20, 4, u64> component_mask_selector; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { ASSERT(mode == SurfaceDataMode::P); constexpr u8 R = 0b0001; constexpr u8 G = 0b0010; @@ -1604,7 +1609,7 @@ union Instruction { return std::bitset<4>{mask.at(component_mask_selector)}.test(component); } - StoreType GetStoreDataLayout() const { + [[nodiscard]] StoreType GetStoreDataLayout() const { ASSERT(mode == SurfaceDataMode::D_BA); return store_data_layout; } @@ -1622,14 +1627,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchTarget() const { + [[nodiscard]] s32 GetBranchTarget() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } bra; @@ -1637,14 +1643,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchExtend() const { + [[nodiscard]] s32 GetBranchExtend() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. 
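+            // e.g. a raw offset of 8 bytes yields 8 / sizeof(Instruction) + 1 = 2,
+            // i.e. two instructions past the current one.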
- return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } brx; @@ -1697,7 +1704,7 @@ union Instruction { BitField<50, 1, u64> is_op_b_register; BitField<51, 3, VmnmxOperation> operation; - VmnmxType SourceFormatA() const { + [[nodiscard]] VmnmxType SourceFormatA() const { switch (src_format_a) { case 0b11: return VmnmxType::Bits32; @@ -1708,7 +1715,7 @@ union Instruction { } } - VmnmxType SourceFormatB() const { + [[nodiscard]] VmnmxType SourceFormatB() const { switch (src_format_b) { case 0b11: return VmnmxType::Bits32; @@ -1739,7 +1746,7 @@ union Instruction { BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; - u64 GetOffset() const { + [[nodiscard]] u64 GetOffset() const { return shifted_offset * 4; } } cbuf34; @@ -1748,7 +1755,7 @@ union Instruction { BitField<20, 16, s64> offset; BitField<36, 5, u64> index; - s64 GetOffset() const { + [[nodiscard]] s64 GetOffset() const { return offset; } } cbuf36; @@ -1893,6 +1900,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1996,29 +2004,29 @@ public: /// Returns whether an opcode has an execution predicate field or not (ie, whether it can be /// conditionally executed). - static bool IsPredicatedInstruction(Id opcode) { + [[nodiscard]] static bool IsPredicatedInstruction(Id opcode) { // TODO(Subv): Add the rest of unpredicated instructions. return opcode != Id::SSY && opcode != Id::PBK; } class Matcher { public: - constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type) - : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {} + constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_) + : name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {} - constexpr const char* GetName() const { + [[nodiscard]] constexpr const char* GetName() const { return name; } - constexpr u16 GetMask() const { + [[nodiscard]] constexpr u16 GetMask() const { return mask; } - constexpr Id GetId() const { + [[nodiscard]] constexpr Id GetId() const { return id; } - constexpr Type GetType() const { + [[nodiscard]] constexpr Type GetType() const { return type; } @@ -2027,7 +2035,7 @@ public: * @param instruction The instruction to test * @returns true if the given instruction matches. */ - constexpr bool Matches(u16 instruction) const { + [[nodiscard]] constexpr bool Matches(u16 instruction) const { return (instruction & mask) == expected; } @@ -2039,7 +2047,8 @@ public: Type type; }; - static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { + using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>; + [[nodiscard]] static DecodeResult Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -2061,7 +2070,7 @@ private: * A '0' in a bitstring indicates that a zero must be present at that bit position. * A '1' in a bitstring indicates that a one must be present at that bit position. 
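* Any other character (such as '-') marks a don't-care bit that is excluded from the mask.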
*/ - static constexpr auto GetMaskAndExpect(const char* const bitstring) { + [[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; for (std::size_t i = 0; i < opcode_bitsize; i++) { const std::size_t bit_position = opcode_bitsize - i - 1; @@ -2083,14 +2092,14 @@ private: public: /// Creates a matcher that can match and parse instructions based on bitstring. - static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type, - const char* const name) { + [[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op, + Type type, const char* const name) { const auto [mask, expected] = GetMaskAndExpect(bitstring); return Matcher(name, mask, expected, op, type); } }; - static std::vector<Matcher> GetDecodeTable() { + [[nodiscard]] static std::vector<Matcher> GetDecodeTable() { std::vector<Matcher> table = { #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name) INST("111000110011----", Id::KIL, Type::Flow, "KIL"), @@ -2205,6 +2214,7 @@ private: INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index 72e2a33d5..ceec05459 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -41,30 +41,30 @@ struct Header { BitField<26, 1, u32> does_load_or_store; BitField<27, 1, u32> does_fp64; BitField<28, 4, u32> stream_out_mask; - } common0{}; + } common0; union { BitField<0, 24, u32> shader_local_memory_low_size; BitField<24, 8, u32> per_patch_attribute_count; - } common1{}; + } common1; union { BitField<0, 24, u32> shader_local_memory_high_size; BitField<24, 8, u32> threads_per_input_primitive; - } common2{}; + } common2; union { BitField<0, 24, u32> shader_local_memory_crs_size; BitField<24, 4, OutputTopology> output_topology; BitField<28, 4, u32> reserved; - } common3{}; + } common3; union { BitField<0, 12, u32> max_output_vertices; BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders. BitField<20, 4, u32> reserved; BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders. 
- } common4{}; + } common4; union { struct { @@ -145,7 +145,7 @@ struct Header { } } ps; - std::array<u32, 0xF> raw{}; + std::array<u32, 0xF> raw; }; u64 GetLocalMemorySize() const { @@ -153,7 +153,6 @@ struct Header { (common2.shader_local_memory_high_size << 24)); } }; - static_assert(sizeof(Header) == 0x50, "Incorrect structure size"); } // namespace Tegra::Shader diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 06cc12d5a..de6991ef6 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -74,8 +74,6 @@ public: } void WaitPendingFences() { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait()) { @@ -83,8 +81,8 @@ public: } PopAsyncFlushes(); if (current_fence->IsSemaphore()) { - memory_manager.template Write<u32>(current_fence->GetAddress(), - current_fence->GetPayload()); + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); } else { gpu.IncrementSyncPoint(current_fence->GetPayload()); } @@ -93,13 +91,13 @@ public: } protected: - FenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - TTextureCache& texture_cache, TTBufferCache& buffer_cache, - TQueryCache& query_cache) - : system{system}, rasterizer{rasterizer}, texture_cache{texture_cache}, - buffer_cache{buffer_cache}, query_cache{query_cache} {} + explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TQueryCache& query_cache_) + : rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()}, + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} - virtual ~FenceManager() {} + virtual ~FenceManager() = default; /// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is /// true @@ -113,16 +111,15 @@ protected: /// Waits until a fence has been signalled by the host GPU. 
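    /// Blocks the caller; backends implement this with a host sync-object wait.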
virtual void WaitFence(TFence& fence) = 0; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; TTextureCache& texture_cache; TTBufferCache& buffer_cache; TQueryCache& query_cache; private: void TryReleasePendingFences() { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait() && !IsFenceSignaled(current_fence)) { @@ -130,8 +127,8 @@ private: } PopAsyncFlushes(); if (current_fence->IsSemaphore()) { - memory_manager.template Write<u32>(current_fence->GetAddress(), - current_fence->GetPayload()); + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); } else { gpu.IncrementSyncPoint(current_fence->GetPayload()); } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index acb6e6d46..ebd149c3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -27,9 +27,10 @@ namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_) - : system{system_}, dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, - memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) + : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, + dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, + cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_}, maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, fermi_2d{std::make_unique<Engines::Fermi2D>()}, kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, @@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() { return *dma_pusher; } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return *cdma_pusher; +} + const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return *cdma_pusher; +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { // Synced GPU, is always in sync if (!is_async) { @@ -185,30 +194,6 @@ void GPU::SyncGuestHost() { void GPU::OnCommandListEnd() { renderer->Rasterizer().ReleaseFences(); } -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. 
-enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; enum class GpuSemaphoreOperation { AcquireEqual = 0x1, @@ -268,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -382,6 +372,25 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { } } +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", + static_cast<u32>(regs.fence_action.op.Value())); + } +} + +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); +} + void GPU::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index c7d11deb2..21410e125 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,14 +13,15 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h" #include "video_core/dma_pusher.h" using CacheAddr = std::uintptr_t; -inline CacheAddr ToCacheAddr(const void* host_ptr) { +[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { return reinterpret_cast<CacheAddr>(host_ptr); } -inline u8* FromCacheAddr(CacheAddr cache_addr) { +[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { return reinterpret_cast<u8*>(cache_addr); } @@ -148,16 +149,16 @@ public: u32 subchannel{}; u32 method_count{}; - bool IsLastCall() const { - return method_count <= 1; - } - MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) : method(method), argument(argument), subchannel(subchannel), method_count(method_count) {} + + [[nodiscard]] bool IsLastCall() const { + return method_count <= 1; + } }; - explicit GPU(Core::System& system, bool is_async); + explicit GPU(Core::System& system, bool is_async, bool use_nvdec); virtual ~GPU(); /// Binds a renderer to the GPU. @@ -178,10 +179,10 @@ public: virtual void OnCommandListEnd(); /// Request a host GPU memory flush from the CPU. - u64 RequestFlush(VAddr addr, std::size_t size); + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - u64 CurrentFlushRequestFence() const { + [[nodiscard]] u64 CurrentFlushRequestFence() const { return current_flush_fence.load(std::memory_order_relaxed); } @@ -189,39 +190,52 @@ public: void TickWork(); /// Returns a reference to the Maxwell3D GPU engine. 
- Engines::Maxwell3D& Maxwell3D(); + [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); /// Returns a const reference to the Maxwell3D GPU engine. - const Engines::Maxwell3D& Maxwell3D() const; + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const; /// Returns a reference to the KeplerCompute GPU engine. - Engines::KeplerCompute& KeplerCompute(); + [[nodiscard]] Engines::KeplerCompute& KeplerCompute(); /// Returns a reference to the KeplerCompute GPU engine. - const Engines::KeplerCompute& KeplerCompute() const; + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const; /// Returns a reference to the GPU memory manager. - Tegra::MemoryManager& MemoryManager(); + [[nodiscard]] Tegra::MemoryManager& MemoryManager(); /// Returns a const reference to the GPU memory manager. - const Tegra::MemoryManager& MemoryManager() const; + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const; /// Returns a reference to the GPU DMA pusher. - Tegra::DmaPusher& DmaPusher(); + [[nodiscard]] Tegra::DmaPusher& DmaPusher(); - VideoCore::RendererBase& Renderer() { + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const; + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher(); + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { return *renderer; } - const VideoCore::RendererBase& Renderer() const { + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { return *renderer; } - VideoCore::ShaderNotify& ShaderNotify() { + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { return *shader_notify; } - const VideoCore::ShaderNotify& ShaderNotify() const { + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { return *shader_notify; } @@ -233,24 +247,43 @@ public: void IncrementSyncPoint(u32 syncpoint_id); - u32 GetSyncpointValue(u32 syncpoint_id) const; + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const; void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - u64 GetTicks() const; + [[nodiscard]] u64 GetTicks() const; - std::unique_lock<std::mutex> LockSync() { + [[nodiscard]] std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } - bool IsAsync() const { + [[nodiscard]] bool IsAsync() const { return is_async; } - /// Returns a const reference to the GPU DMA pusher. 
- const Tegra::DmaPusher& DmaPusher() const; + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; struct Regs { static constexpr size_t NUM_REGS = 0x40; @@ -262,7 +295,7 @@ public: u32 address_high; u32 address_low; - GPUVAddr SemaphoreAddress() const { + [[nodiscard]] GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } @@ -280,10 +313,7 @@ public: u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; + FenceAction fence_action; INSERT_UNION_PADDING_WORDS(0xE2); // Puller state @@ -311,6 +341,9 @@ public: /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; + /// Push GPU command buffer entries to be processed + virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0; + /// Swap buffers (render frame) virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; @@ -328,6 +361,8 @@ protected: private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); @@ -343,16 +378,17 @@ private: u32 methods_pending); /// Determines where the method should be executed. 
- bool ExecuteMethodOnEngine(u32 method); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); protected: Core::System& system; + std::unique_ptr<Tegra::MemoryManager> memory_manager; std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<VideoCore::RendererBase> renderer; + const bool use_nvdec; private: - std::unique_ptr<Tegra::MemoryManager> memory_manager; - /// Mapping of command subchannels to their bound engine ids std::array<EngineID, 8> bound_engines = {}; /// 3D engine @@ -373,6 +409,7 @@ private: std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::mutex sync_mutex; + std::mutex device_mutex; std::condition_variable sync_cv; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 70a3d5738..a9baaf7ef 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -10,12 +10,13 @@ namespace VideoCommon { -GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {} +GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec) + : GPU{system, true, use_nvdec}, gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; void GPUAsynch::Start() { - gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); cpu_context = renderer->GetRenderWindow().CreateSharedContext(); cpu_context->MakeCurrent(); } @@ -32,6 +33,27 @@ void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } +void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { gpu_thread.SwapBuffers(framebuffer); } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index f89c855a5..0c0872e73 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -20,13 +20,14 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU asynchronously class GPUAsynch final : public Tegra::GPU { public: - explicit GPUAsynch(Core::System& system); + explicit GPUAsynch(Core::System& system, bool use_nvdec); ~GPUAsynch() override; void Start() override; void ObtainContext() override; void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 1ca47ddef..ecf7bbdf3 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -7,7 +7,7 @@ namespace VideoCommon { -GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {} +GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : 
GPU{system, false, use_nvdec} {} GPUSynch::~GPUSynch() = default; @@ -26,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->DispatchCalls(); } +void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clears all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { renderer->SwapBuffers(framebuffer); } diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 297258cb1..9d778c71a 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -19,13 +19,14 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU synchronously class GPUSynch final : public Tegra::GPU { public: - explicit GPUSynch(Core::System& system); + explicit GPUSynch(Core::System& system, bool use_nvdec); ~GPUSynch() override; void Start() override; void ObtainContext() override; void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index bf761abf2..4b8f58283 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -18,7 +18,7 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, - SynchState& state) { + SynchState& state, Tegra::CDmaPusher& cdma_pusher) { std::string name = "yuzu:GPU"; MicroProfileOnThreadCreate(name.c_str()); Common::SetCurrentThreadName(name.c_str()); @@ -42,6 +42,10 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); + } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { + // NVDEC + cdma_pusher.Push(std::move(command_list->entries)); + cdma_pusher.DispatchCalls(); } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { renderer.SwapBuffers(data->framebuffer ? 
&*data->framebuffer : nullptr); } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { @@ -75,15 +79,19 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher) { - thread = std::thread{RunThread, std::ref(system), std::ref(renderer), - std::ref(context), std::ref(dma_pusher), std::ref(state)}; + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { PushCommand(SubmitListCommand(std::move(entries))); } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { + PushCommand(SubmitChCommandEntries(std::move(entries))); +} + void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 5a28335d6..32a34e3a7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -37,6 +37,14 @@ struct SubmitListCommand final { Tegra::CommandList entries; }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { + explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries) + : entries{std::move(entries)} {} + + Tegra::ChCommandHeaderList entries; +}; + /// Command to signal to the GPU thread that a swap buffers is pending struct SwapBuffersCommand final { explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) @@ -77,9 +85,9 @@ struct OnCommandListEndCommand final {}; struct GPUTickCommand final {}; using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, - InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, - GPUTickCommand>; + std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, + SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, + FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; struct CommandDataContainer { CommandDataContainer() = default; @@ -109,11 +117,14 @@ public: /// Creates and starts the GPU thread. 
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher); + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher); /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); + /// Push GPU CDMA command buffer entries to be processed + void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt new file mode 100644 index 000000000..c157724a9 --- /dev/null +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -0,0 +1,36 @@ +set(SHADER_SOURCES + opengl_present.frag + opengl_present.vert +) + +set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) +set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) + +set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) +set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) + +foreach(FILENAME IN ITEMS ${SHADER_SOURCES}) + string(REPLACE "." "_" SHADER_NAME ${FILENAME}) + set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) + set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE}) +endforeach() + +add_custom_target(host_shaders + DEPENDS + ${SHADER_HEADERS} + SOURCES + ${SHADER_SOURCES} +) diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake new file mode 100644 index 000000000..c0fc49768 --- /dev/null +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -0,0 +1,13 @@ +set(SOURCE_FILE ${CMAKE_ARGV3}) +set(HEADER_FILE ${CMAKE_ARGV4}) +set(INPUT_FILE ${CMAKE_ARGV5}) + +get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) +string(REPLACE "." 
"_" CONTENTS_NAME ${CONTENTS_NAME}) +string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) + +file(READ ${SOURCE_FILE} CONTENTS) + +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) +configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag new file mode 100644 index 000000000..8a4cb024b --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.frag @@ -0,0 +1,10 @@ +#version 430 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; + +layout (binding = 0) uniform sampler2D color_texture; + +void main() { + color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); +} diff --git a/src/video_core/host_shaders/opengl_present.vert b/src/video_core/host_shaders/opengl_present.vert new file mode 100644 index 000000000..2235d31a4 --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.vert @@ -0,0 +1,24 @@ +#version 430 core + +out gl_PerVertex { + vec4 gl_Position; +}; + +layout (location = 0) in vec2 vert_position; +layout (location = 1) in vec2 vert_tex_coord; +layout (location = 0) out vec2 frag_tex_coord; + +// This is a truncated 3x3 matrix for 2D transformations: +// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. +// The third column performs translation. +// The third row could be used for projection, which we don't need in 2D. It hence is assumed to +// implicitly be [0, 0, 1] +layout (location = 0) uniform mat3x2 modelview_matrix; + +void main() { + // Multiply input position by the rotscale part of the matrix and then manually translate by + // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector + // to `vec3(vert_position.xy, 1.0)` + gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); + frag_tex_coord = vert_tex_coord; +} diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in new file mode 100644 index 000000000..ccdb0d2a9 --- /dev/null +++ b/src/video_core/host_shaders/source_shader.h.in @@ -0,0 +1,9 @@ +#pragma once + +#include <string_view> + +namespace HostShaders { + +constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; + +} // namespace HostShaders diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index a50e7b4e0..cd21a2112 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -36,7 +36,7 @@ void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, } } else { // Macro not compiled, check if it's uploaded and if so, compile it - std::optional<u32> mid_method = std::nullopt; + std::optional<u32> mid_method; const auto macro_code = uploaded_macro_code.find(method); if (macro_code == uploaded_macro_code.end()) { for (const auto& [method_base, code] : uploaded_macro_code) { diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index c1b9e4ad9..954b87515 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -14,11 +14,11 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); namespace Tegra { -static const Xbyak::Reg64 STATE = Xbyak::util::rbx; -static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; -static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; -static const 
Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; -static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; +constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; +constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp; +constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ STATE, diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index c217f5bb2..6e70bd362 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -11,6 +11,7 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" namespace Tegra { @@ -44,6 +45,12 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_ return Map(cpu_addr, *FindFreeRange(size, align), size); } +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { + const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); + ASSERT(gpu_addr); + return Map(cpu_addr, *gpu_addr, size); +} + void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { if (!size) { return; @@ -58,7 +65,7 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) { for (u64 offset{}; offset < size; offset += page_size) { if (!GetPageEntry(gpu_addr + offset).IsUnmapped()) { - return {}; + return std::nullopt; } } @@ -108,7 +115,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s page_table[PageEntryIndex(gpu_addr)] = page_entry; } -std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const { +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address) const { if (!align) { align = page_size; } else { @@ -116,7 +124,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size } u64 available_size{}; - GPUVAddr gpu_addr{address_space_start}; + GPUVAddr gpu_addr{start_32bit_address ? 
address_space_start_low : address_space_start}; while (gpu_addr + available_size < address_space_size) { if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { available_size += page_size; @@ -135,13 +143,13 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size } } - return {}; + return std::nullopt; } std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { const auto page_entry{GetPageEntry(gpu_addr)}; if (!page_entry.IsValid()) { - return {}; + return std::nullopt; } return page_entry.ToAddress() + (gpu_addr & page_mask); @@ -316,10 +324,10 @@ void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_add WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); } -bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { +bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { const auto cpu_addr{GpuToCpuAddress(gpu_addr)}; if (!cpu_addr) { - return {}; + return false; } const std::size_t page{(*cpu_addr & Core::Memory::PAGE_MASK) + size}; return page <= Core::Memory::PAGE_SIZE; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 8953fcb53..c078193d9 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -31,19 +31,19 @@ public: constexpr PageEntry(State state) : state{state} {} constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {} - constexpr bool IsUnmapped() const { + [[nodiscard]] constexpr bool IsUnmapped() const { return state == State::Unmapped; } - constexpr bool IsAllocated() const { + [[nodiscard]] constexpr bool IsAllocated() const { return state == State::Allocated; } - constexpr bool IsValid() const { + [[nodiscard]] constexpr bool IsValid() const { return !IsUnmapped() && !IsAllocated(); } - constexpr VAddr ToAddress() const { + [[nodiscard]] constexpr VAddr ToAddress() const { if (!IsValid()) { return {}; } @@ -51,7 +51,7 @@ public: return static_cast<VAddr>(state) << ShiftBits; } - constexpr PageEntry operator+(u64 offset) { + [[nodiscard]] constexpr PageEntry operator+(u64 offset) const { // If this is a reserved value, offsets do not apply if (!IsValid()) { return *this; @@ -74,16 +74,16 @@ public: /// Binds a renderer to the memory manager. void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); - std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; + [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; template <typename T> - T Read(GPUVAddr addr) const; + [[nodiscard]] T Read(GPUVAddr addr) const; template <typename T> void Write(GPUVAddr addr, T data); - u8* GetPointer(GPUVAddr addr); - const u8* GetPointer(GPUVAddr addr) const; + [[nodiscard]] u8* GetPointer(GPUVAddr addr); + [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const; /** * ReadBlock and WriteBlock are full read and write operations over virtual @@ -112,29 +112,32 @@ public: /** * IsGranularRange checks if a gpu region can be simply read with a pointer. 
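
The granularity test documented just above boils down to one page-crossing check: a GPU range can be serviced through a single host pointer only if its CPU mirror stays within one page. A compile-time sketch of the arithmetic, assuming the constants match the 4 KiB guest pages of Core::Memory:

    #include <cstddef>
    #include <cstdint>

    // Assumed to match Core::Memory::PAGE_SIZE / PAGE_MASK (4 KiB guest pages).
    constexpr std::uint64_t PAGE_SIZE = 4096;
    constexpr std::uint64_t PAGE_MASK = PAGE_SIZE - 1;

    // True when [cpu_addr, cpu_addr + size) does not cross a page boundary and
    // can therefore be read through one pointer into host memory.
    constexpr bool IsGranular(std::uint64_t cpu_addr, std::size_t size) {
        return (cpu_addr & PAGE_MASK) + size <= PAGE_SIZE;
    }

    static_assert(IsGranular(0x1000, 4096)); // exactly one full page
    static_assert(!IsGranular(0x1FFF, 2));   // two bytes straddling a boundary
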
*/ - bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const; - GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); - GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); - std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); - GPUVAddr Allocate(std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); + [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); void Unmap(GPUVAddr gpu_addr, std::size_t size); private: - PageEntry GetPageEntry(GPUVAddr gpu_addr) const; + [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); - std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const; + [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address = false) const; void TryLockPage(PageEntry page_entry, std::size_t size); void TryUnlockPage(PageEntry page_entry, std::size_t size); - static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) { + [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) { return (gpu_addr >> page_bits) & page_table_mask; } static constexpr u64 address_space_size = 1ULL << 40; static constexpr u64 address_space_start = 1ULL << 32; + static constexpr u64 address_space_start_low = 1ULL << 16; static constexpr u64 page_bits{16}; static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 0d3a88765..fc54ca0ef 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -91,14 +91,15 @@ private: std::shared_ptr<HostCounter> last; }; -template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, - class QueryPool> +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase { public: - explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ - static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -118,29 +119,27 @@ public: */ void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { std::unique_lock lock{mutex}; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr_opt); - VAddr cpu_addr = *cpu_addr_opt; + const std::optional<VAddr> cpu_addr = 
gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); - CachedQuery* query = TryGet(cpu_addr); + CachedQuery* query = TryGet(*cpu_addr); if (!query) { - ASSERT_OR_EXECUTE(cpu_addr_opt, return;); - const auto host_ptr = memory_manager.GetPointer(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - query = Register(type, cpu_addr, host_ptr, timestamp.has_value()); + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } query->BindCounter(Stream(type).Current(), timestamp); if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushQuery(cpu_addr); + AsyncFlushQuery(*cpu_addr); } } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. void UpdateCounters() { std::unique_lock lock{mutex}; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); } @@ -206,9 +205,6 @@ public: committed_flushes.pop_front(); } -protected: - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; - private: /// Flushes a memory range to guest memory and removes it from the cache. void FlushAndRemoveRegion(VAddr addr, std::size_t size) { @@ -270,8 +266,9 @@ private: static constexpr std::uintptr_t PAGE_SIZE = 4096; static constexpr unsigned PAGE_BITS = 12; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; std::recursive_mutex mutex; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 3cbdac8e7..27ef4c69a 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -32,7 +32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size class RasterizerInterface { public: - virtual ~RasterizerInterface() {} + virtual ~RasterizerInterface() = default; /// Dispatches a draw invocation virtual void Draw(bool is_indexed, bool is_instanced) = 0; @@ -90,15 +90,16 @@ public: virtual void TickFrame() = 0; /// Attempt to use a faster method to perform a surface copy - virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Tegra::Engines::Fermi2D::Config& copy_config) { + [[nodiscard]] virtual bool AccelerateSurfaceCopy( + const Tegra::Engines::Fermi2D::Regs::Surface& src, + const Tegra::Engines::Fermi2D::Regs::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } /// Attempt to use a faster method to display the framebuffer to screen - virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, - u32 pixel_stride) { + [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { return false; } @@ -106,19 +107,16 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, - const DiskResourceLoadCallback& callback = {}) {} - - /// Initializes renderer dirty flags - virtual void SetupDirtyFlags() {} + virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info 
on the guest driver. - GuestDriverProfile& AccessGuestDriverProfile() { + [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; } /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - const GuestDriverProfile& AccessGuestDriverProfile() const { + [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const { return guest_driver_profile; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 649074acd..51dde8eb5 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -38,7 +38,7 @@ public: virtual ~RendererBase(); /// Initialize the renderer - virtual bool Init() = 0; + [[nodiscard]] virtual bool Init() = 0; /// Shutdown the renderer virtual void ShutDown() = 0; @@ -46,51 +46,46 @@ public: /// Finalize rendering the guest frame and draw into the presentation texture virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; - /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer - /// specific implementation) - /// Returns true if a frame was drawn - virtual bool TryPresent(int timeout_ms) = 0; - // Getter/setter functions: // ------------------------ - f32 GetCurrentFPS() const { + [[nodiscard]] f32 GetCurrentFPS() const { return m_current_fps; } - int GetCurrentFrame() const { + [[nodiscard]] int GetCurrentFrame() const { return m_current_frame; } - RasterizerInterface& Rasterizer() { + [[nodiscard]] RasterizerInterface& Rasterizer() { return *rasterizer; } - const RasterizerInterface& Rasterizer() const { + [[nodiscard]] const RasterizerInterface& Rasterizer() const { return *rasterizer; } - Core::Frontend::GraphicsContext& Context() { + [[nodiscard]] Core::Frontend::GraphicsContext& Context() { return *context; } - const Core::Frontend::GraphicsContext& Context() const { + [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const { return *context; } - Core::Frontend::EmuWindow& GetRenderWindow() { + [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() { return render_window; } - const Core::Frontend::EmuWindow& GetRenderWindow() const { + [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const { return render_window; } - RendererSettings& Settings() { + [[nodiscard]] RendererSettings& Settings() { return renderer_settings; } - const RendererSettings& Settings() const { + [[nodiscard]] const RendererSettings& Settings() const { return renderer_settings; } diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index b7e9ed2e9..d6120c23e 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -39,8 +39,8 @@ using Operation = const OperationNode&; constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; char Swizzle(std::size_t component) { - ASSERT(component < 4); - return component["xyzw"]; + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); } constexpr bool IsGenericAttribute(Attribute::Index index) { @@ -224,7 +224,7 @@ private: std::string Visit(const Node& node); - std::pair<std::string, std::size_t> BuildCoords(Operation); + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); std::string BuildAoffi(Operation); std::string GlobalMemoryPointer(const GmemNode& gmem); void Exit(); @@ -376,9 +376,11 @@ private: std::string temporary = 
AllocTemporary(); std::string address; std::string_view opname; + bool robust = false; if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { address = GlobalMemoryPointer(*gmem); opname = "ATOM"; + robust = true; } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); opname = "ATOMS"; @@ -386,7 +388,15 @@ private: UNREACHABLE(); return "{0, 0, 0, 0}"; } + if (robust) { + AddLine("IF NE.x;"); + } AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } return temporary; } @@ -980,10 +990,9 @@ void ARBDecompiler::DeclareLocalMemory() { } void ARBDecompiler::DeclareGlobalMemory() { - const std::size_t num_entries = ir.GetGlobalMemory().size(); + const size_t num_entries = ir.GetGlobalMemory().size(); if (num_entries > 0) { - const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2; - AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1); + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); } } @@ -1363,7 +1372,8 @@ std::string ARBDecompiler::Visit(const Node& node) { if (const auto gmem = std::get_if<GmemNode>(&*node)) { std::string temporary = AllocTemporary(); - AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem)); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); return temporary; } @@ -1406,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) { return {}; } -std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); UNIMPLEMENTED_IF(meta.sampler.is_indexed); - UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array && - meta.sampler.type == Tegra::Shader::TextureType::TextureCube); + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; const std::size_t count = operation.GetOperandsCount(); std::string temporary = AllocVectorTemporary(); std::size_t i = 0; @@ -1419,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operati AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); } if (meta.sampler.is_array) { - AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array)); + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; } if (meta.sampler.is_shadow) { - AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare)); + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; } - return {std::move(temporary), i}; + return {temporary, temporary, i}; } std::string ARBDecompiler::BuildAoffi(Operation operation) { @@ -1441,18 +1460,21 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) { } std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // 
address = c[binding].xy + // length = c[binding].z const u32 binding = global_memory_names.at(gmem.GetDescriptor()); - const char result_swizzle = binding % 2 == 0 ? 'x' : 'y'; const std::string pointer = AllocLongVectorTemporary(); std::string temporary = AllocTemporary(); - const u32 local_index = binding / 2; - AddLine("PK64.U {}, c[{}];", pointer, local_index); + AddLine("PK64.U {}, c[{}];", pointer, binding); AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), Visit(gmem.GetBaseAddress())); AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); - AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); return fmt::format("{}.x", pointer); } @@ -1552,7 +1574,9 @@ std::string ARBDecompiler::Assign(Operation operation) { ResetTemporaries(); return {}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); ResetTemporaries(); return {}; } else { @@ -1844,7 +1868,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) { std::string ARBDecompiler::Texture(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string_view opcode = "TEX"; std::string extra; @@ -1873,7 +1897,7 @@ std::string ARBDecompiler::Texture(Operation operation) { } } - AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id, + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1882,7 +1906,7 @@ std::string ARBDecompiler::Texture(Operation operation) { std::string ARBDecompiler::TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string comp; if (!meta.sampler.is_shadow) { @@ -1892,7 +1916,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) { AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, TextureType(meta), BuildAoffi(operation)); - AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); return fmt::format("{}.x", temporary); } @@ -1930,13 +1954,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) { std::string ARBDecompiler::TexelFetch(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); if (!meta.sampler.is_buffer) { ASSERT(swizzle < 4); AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); } - AddLine("TXF.F {}, {}, texture[{}], {}{};", 
temporary, temporary, sampler_id, TextureType(meta), + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1947,7 +1971,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) { const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; const std::string ddx = AllocVectorTemporary(); const std::string ddy = AllocVectorTemporary(); - const std::string coord = BuildCoords(operation).first; + const std::string coord = std::get<1>(BuildCoords(operation)); const std::size_t num_components = meta.derivates.size() / 2; for (std::size_t index = 0; index < num_components; ++index) { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index e866d8f2f..b1c4cd62f 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -59,9 +59,10 @@ void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); } -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, +OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const Device& device_, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, + : GenericBufferCache{rasterizer, gpu_memory, cpu_memory, std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, device{device_} { if (!device.HasFastBufferSubData()) { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 88fdc0536..f75b32e31 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -52,7 +52,8 @@ private: using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class OGLBufferCache final : public GenericBufferCache { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const Device& device, std::size_t stream_size); ~OGLBufferCache(); diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index e7d95149f..a94e4f72e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -193,7 +193,6 @@ bool IsASTCSupported() { Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 3d2588dd2..b532fdcc2 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -45,11 +45,10 @@ void GLInnerFence::Wait() { glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); } 
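
The gl_arb_decompiler.cpp hunks above make global memory accesses bounds-checked: GlobalMemoryPointer now compares the computed offset against the SSBO length kept in c[binding].z and sets the condition code, and every LOAD.U32, STORE.U32 and ATOM is predicated on NE.x, so out-of-bounds reads return zero and out-of-bounds writes are dropped rather than faulting. Roughly, the emitted assembly behaves like this C++ model (BindlessBuffer is an illustrative stand-in):

    #include <cstdint>
    #include <cstring>

    // Illustrative model: base/length mirror the c[binding].xy address and the
    // c[binding].z length the decompiler reads for each SSBO binding.
    struct BindlessBuffer {
        std::uint8_t* base;
        std::uint32_t length;
    };

    // MOV {}, 0; SLT.U.CC RC.x, offset, length; LOAD.U32 {} (NE.x), address;
    std::uint32_t RobustLoad(const BindlessBuffer& ssbo, std::uint32_t offset) {
        std::uint32_t value = 0;
        if (offset < ssbo.length) {
            std::memcpy(&value, ssbo.base + offset, sizeof(value));
        }
        return value; // out-of-bounds reads yield zero
    }

    // IF NE.x; STORE.U32 value, address; ENDIF;
    void RobustStore(BindlessBuffer& ssbo, std::uint32_t offset, std::uint32_t value) {
        if (offset < ssbo.length) {
            std::memcpy(ssbo.base + offset, &value, sizeof(value));
        }
    }
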
-FenceManagerOpenGL::FenceManagerOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, QueryCache& query_cache) - : GenericFenceManager(system, rasterizer, texture_cache, buffer_cache, query_cache) {} + : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {} Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { return std::make_shared<GLInnerFence>(value, is_stubbed); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index 1686cf5c8..da1dcdace 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -37,9 +37,9 @@ using GenericFenceManager = class FenceManagerOpenGL final : public GenericFenceManager { public: - FenceManagerOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, - QueryCache& query_cache); + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, + QueryCache& query_cache); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index d7ba57aca..1a3d9720e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,12 +30,11 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) - : VideoCommon::QueryCacheBase< - QueryCache, CachedQuery, CounterStream, HostCounter, - std::vector<OGLQuery>>{system, - static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, - gl_rasterizer{gl_rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory) + : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>( + rasterizer, maxwell3d, gpu_memory), + gl_rasterizer{rasterizer} {} QueryCache::~QueryCache() = default; @@ -90,6 +89,8 @@ u64 HostCounter::BlockingQuery() const { CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::~CachedQuery() = default; + CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index d8e7052a1..82cac51ee 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,10 +26,11 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; -class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, - HostCounter, std::vector<OGLQuery>> { +class QueryCache final + : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(Core::System& system, RasterizerOpenGL& 
rasterizer); + explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -40,6 +41,7 @@ public: private: RasterizerOpenGL& gl_rasterizer; + std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { @@ -62,10 +64,12 @@ class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); - CachedQuery(CachedQuery&& rhs) noexcept; - CachedQuery(const CachedQuery&) = delete; + ~CachedQuery() override; + CachedQuery(CachedQuery&& rhs) noexcept; CachedQuery& operator=(CachedQuery&& rhs) noexcept; + + CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; void Flush() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 4af5824cd..cfddbde5d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -139,30 +139,29 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } -void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) { - if (num_entries == 0) { +void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { + if (num_ssbos == 0) { return; } - if (num_entries % 2 == 1) { - pointers[num_entries] = 0; - } - const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2); - glProgramLocalParametersI4uivNV(target, 0, num_vectors, - reinterpret_cast<const GLuint*>(pointers)); + glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), + reinterpret_cast<const GLuint*>(ssbos)); } } // Anonymous namespace -RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - const Device& device, ScreenInfo& info, - ProgramManager& program_manager, StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device, - state_tracker}, - shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, - fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, - async_shaders{emu_window} { +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_) + : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), + screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), + texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker), + shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device), + query_cache(*this, maxwell3d, gpu_memory), + buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), + async_shaders(emu_window) { CheckExtensions(); 
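
The rasterizer constructor above injects gpu, maxwell3d, kepler_compute and gpu_memory directly instead of reaching through Core::System on every call. One C++ detail this initializer list relies on: members are initialized in declaration order, so the engine references must be declared before the caches that consume them. A reduced illustration with placeholder names:

    struct Engine {};

    struct Cache {
        explicit Cache(Engine& engine_) : engine{engine_} {}
        Engine& engine;
    };

    class Rasterizer {
    public:
        explicit Rasterizer(Engine& engine_)
            // Members are initialized in *declaration* order below: engine is
            // bound first, so cache{engine} sees a valid reference.
            : engine{engine_}, cache{engine} {}

    private:
        Engine& engine; // declared (and therefore initialized) first
        Cache cache;    // constructed second, after engine is valid
    };
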
unified_uniform_buffer.Create(); @@ -196,8 +195,7 @@ void RasterizerOpenGL::CheckExtensions() { } void RasterizerOpenGL::SetupVertexFormat() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } @@ -217,7 +215,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } flags[Dirty::VertexFormat0 + index] = false; - const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto attrib = maxwell3d.regs.vertex_attrib_format[index]; const auto gl_index = static_cast<GLuint>(index); // Disable constant attributes. @@ -241,8 +239,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } void RasterizerOpenGL::SetupVertexBuffer() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexBuffers]) { return; } @@ -253,7 +250,7 @@ void RasterizerOpenGL::SetupVertexBuffer() { const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -290,14 +287,13 @@ void RasterizerOpenGL::SetupVertexBuffer() { } void RasterizerOpenGL::SetupVertexInstances() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; } flags[Dirty::VertexInstances] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; @@ -313,7 +309,7 @@ void RasterizerOpenGL::SetupVertexInstances() { GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t size = CalculateIndexBufferSize(); const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); @@ -322,15 +318,14 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); - auto& gpu = system.GPU().Maxwell3D(); u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; + const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: program_manager.UseGeometryShader(0); @@ -391,11 +386,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - gpu.dirty.flags[Dirty::Shaders] = false; + maxwell3d.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -413,34 +408,27 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { } std::size_t 
RasterizerOpenGL::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } -void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - shader_cache.LoadDiskCache(stop_loading, callback); -} - -void RasterizerOpenGL::SetupDirtyFlags() { - state_tracker.Initialize(); + shader_cache.LoadDiskCache(title_id, stop_loading, callback); } void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); - auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); View depth_surface = texture_cache.GetDepthBufferSurface(true); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces @@ -472,8 +460,7 @@ void RasterizerOpenGL::ConfigureFramebuffers() { } void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { - auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; texture_cache.GuardRenderTargets(true); View color_surface; @@ -523,12 +510,11 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_de } void RasterizerOpenGL::Clear() { - const auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -593,7 +579,6 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); - auto& gpu = system.GPU().Maxwell3D(); query_cache.UpdateCounters(); @@ -641,7 +626,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { if (invalidated) { // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty - auto& dirty = gpu.dirty.flags; + auto& dirty = maxwell3d.dirty.flags; dirty[Dirty::VertexBuffers] = true; for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { dirty[index] = true; @@ -662,7 +647,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Setup emulation uniform buffer. if (!device.UseAssemblyShaders()) { MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); + ubo.SetFromRegs(maxwell3d); const auto info = buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, @@ -671,7 +656,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Setup shaders and their used resources. 
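
CalculateIndexBufferSize in the hunk above is simply the index count multiplied by the per-index width. A minimal sketch of that arithmetic, assuming the three index widths Maxwell exposes (the enum names here are illustrative, mirroring regs.index_array.format):

    #include <cstddef>
    #include <cstdint>

    // Illustrative stand-in for the guest index format register.
    enum class IndexFormat : std::uint32_t {
        UnsignedByte,  // 1 byte per index
        UnsignedShort, // 2 bytes per index
        UnsignedInt,   // 4 bytes per index
    };

    constexpr std::size_t FormatSizeInBytes(IndexFormat format) {
        switch (format) {
        case IndexFormat::UnsignedByte:
            return 1;
        case IndexFormat::UnsignedShort:
            return 2;
        case IndexFormat::UnsignedInt:
            return 4;
        }
        return 0;
    }

    // Mirrors CalculateIndexBufferSize: total bytes = count * index width.
    constexpr std::size_t IndexBufferSize(std::uint32_t count, IndexFormat format) {
        return static_cast<std::size_t>(count) * FormatSizeInBytes(format);
    }

    static_assert(IndexBufferSize(3, IndexFormat::UnsignedShort) == 6);
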
texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -688,14 +673,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { BeginTransformFeedback(primitive_mode); - const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); const GLsizei num_instances = - static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + static_cast<GLsizei>(is_instanced ? maxwell3d.mme_draw.instance_count : 1); if (is_indexed) { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); - const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); } else if (num_instances == 1 && base_instance == 0) { @@ -714,8 +699,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { base_instance); } } else { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count); if (num_instances == 1 && base_instance == 0) { glDrawArrays(primitive_mode, base_vertex, num_vertices); } else if (base_instance == 0) { @@ -730,7 +715,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { ++num_queued_commands; - system.GPU().TickWork(); + gpu.TickWork(); } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -753,7 +738,8 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { buffer_cache.Unmap(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -815,17 +801,14 @@ void RasterizerOpenGL::SyncGuestHost() { } void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { - auto& memory_manager{gpu.MemoryManager()}; - memory_manager.Write<u32>(addr, value); + gpu_memory.Write<u32>(addr, value); return; } fence_manager.SignalSemaphore(addr, value); } void RasterizerOpenGL::SignalSyncPoint(u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { gpu.IncrementSyncPoint(value); return; @@ -834,7 +817,6 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) { } void RasterizerOpenGL::ReleaseFences() { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { return; } @@ -914,13 +896,13 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, } void 
RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { - static constexpr std::array PARAMETER_LUT = { - GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + static constexpr std::array PARAMETER_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, - GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV}; - + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& stages = maxwell3d.state.shader_stages; const auto& shader_stage = stages[stage_index]; const auto& entries = shader->GetEntries(); const bool use_unified = entries.use_unified_uniforms; @@ -945,7 +927,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* sh void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; const auto& entries = kernel->GetEntries(); const bool use_unified = entries.use_unified_uniforms; @@ -1018,57 +1000,56 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto& cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; + const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; const auto& entries{shader->GetEntries().global_memory_entries}; - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); const bool assembly_shaders = device.UseAssemblyShaders(); u32 binding = assembly_shaders ? 
0 : device.GetBaseBindings(stage_index).shader_storage_buffer; for (const auto& entry : entries) { const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; - const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; - const u32 size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); ++binding; } if (assembly_shaders) { - UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size()); + UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); } } void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto& cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; const auto& entries{kernel->GetEntries().global_memory_entries}; - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); u32 binding = 0; for (const auto& entry : entries) { const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; - const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; - const u32 size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); ++binding; } if (device.UseAssemblyShaders()) { - UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size()); + UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size, - GLuint64EXT* pointer) { - const std::size_t alignment{device.GetShaderStorageBufferAlignment()}; + GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { + const size_t alignment{device.GetShaderStorageBufferAlignment()}; const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); if (device.UseAssemblyShaders()) { - *pointer = info.address + info.offset; + *ssbo = BindlessSSBO{ + .address = static_cast<GLuint64EXT>(info.address + info.offset), + .length = static_cast<GLsizei>(size), + .padding = 0, + }; } else { glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, static_cast<GLsizeiptr>(size)); @@ -1077,7 +1058,6 @@ void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& e void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); @@ -1090,11 +1070,10 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetEntries().samplers) { for 
(std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); + const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); } } @@ -1118,20 +1097,18 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu } void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { - const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); + const auto shader_type = static_cast<ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); } } void RasterizerOpenGL::SetupComputeImages(Shader* shader) { - const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } @@ -1151,9 +1128,8 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t } void RasterizerOpenGL::SyncViewport() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; const bool dirty_viewport = flags[Dirty::Viewports]; const bool dirty_clip_control = flags[Dirty::ClipControl]; @@ -1180,7 +1156,7 @@ void RasterizerOpenGL::SyncViewport() { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { @@ -1225,25 +1201,23 @@ void RasterizerOpenGL::SyncViewport() { } void RasterizerOpenGL::SyncDepthClamp() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::DepthClampEnabled]) { return; } flags[Dirty::DepthClampEnabled] = false; - oglEnable(GL_DEPTH_CLAMP, gpu.regs.view_volume_clip_control.depth_clamp_disabled == 0); + oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { return; } flags[Dirty::ClipDistances] = false; - clip_mask &= gpu.regs.clip_distance_enabled; + clip_mask &= maxwell3d.regs.clip_distance_enabled; if (clip_mask == last_clip_distance_mask) { return; } @@ -1259,9 +1233,8 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::CullTest]) { flags[Dirty::CullTest] = false; @@ -1276,26 +1249,24 @@ void RasterizerOpenGL::SyncCullMode() { } void RasterizerOpenGL::SyncPrimitiveRestart() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PrimitiveRestart]) { return; } 
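Each record the SetupGlobalMemory path above now fills is a 128-bit SSBO descriptor: a 64-bit GPU buffer address plus a 32-bit length and explicit padding (the BindlessSSBO struct and the static_assert pinning it to 128 bits appear in the gl_rasterizer.h hunk further down). The definition of UpdateBindlessSSBOs is outside this excerpt; a plausible sketch, assuming the assembly-shader path uploads the records as integer program local parameters via GL_NV_gpu_program4, one uvec4 per descriptor:

#include <cstddef>
#include <glad/glad.h>

// Mirrors the struct declared in gl_rasterizer.h: 8 + 4 + 4 bytes = 128 bits.
struct BindlessSSBO {
    GLuint64EXT address;
    GLsizei length;
    GLsizei padding;
};

void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, std::size_t count) {
    if (count == 0) {
        return; // no global memory entries for this stage
    }
    // Each 16-byte record lands in one integer local parameter (a uvec4) of the
    // assembly program; assumes a glad loader generated with GL_NV_gpu_program4.
    glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(count),
                                    reinterpret_cast<const GLuint*>(ssbos));
}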
flags[Dirty::PrimitiveRestart] = false; - if (gpu.regs.primitive_restart.enabled) { + if (maxwell3d.regs.primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); - glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index); } else { glDisable(GL_PRIMITIVE_RESTART); } } void RasterizerOpenGL::SyncDepthTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; - const auto& regs = gpu.regs; if (flags[Dirty::DepthMask]) { flags[Dirty::DepthMask] = false; glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE); @@ -1313,14 +1284,13 @@ void RasterizerOpenGL::SyncDepthTestState() { } void RasterizerOpenGL::SyncStencilTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::StencilTest]) { return; } flags[Dirty::StencilTest] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), @@ -1345,25 +1315,24 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncRasterizeEnable() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::RasterizeEnable]) { return; } flags[Dirty::RasterizeEnable] = false; - oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); + oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0); } void RasterizerOpenGL::SyncPolygonModes() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonModes]) { return; } flags[Dirty::PolygonModes] = false; - if (gpu.regs.fill_rectangle) { + const auto& regs = maxwell3d.regs; + if (regs.fill_rectangle) { if (!GLAD_GL_NV_fill_rectangle) { LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); @@ -1376,27 +1345,26 @@ void RasterizerOpenGL::SyncPolygonModes() { return; } - if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + if (regs.polygon_mode_front == regs.polygon_mode_back) { flags[Dirty::PolygonModeFront] = false; flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); return; } if (flags[Dirty::PolygonModeFront]) { flags[Dirty::PolygonModeFront] = false; - glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); } if (flags[Dirty::PolygonModeBack]) { flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back)); } } void RasterizerOpenGL::SyncColorMask() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ColorMasks]) { return; } @@ -1405,7 +1373,7 @@ void RasterizerOpenGL::SyncColorMask() { const bool force = flags[Dirty::ColorMaskCommon]; flags[Dirty::ColorMaskCommon] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.color_mask_common) { if 
(!force && !flags[Dirty::ColorMask0]) { return; @@ -1430,33 +1398,30 @@ void RasterizerOpenGL::SyncColorMask() { } void RasterizerOpenGL::SyncMultiSampleState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::MultisampleControl]) { return; } flags[Dirty::MultisampleControl] = false; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FragmentClampColor]) { return; } flags[Dirty::FragmentClampColor] = false; - glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); + glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::BlendColor]) { flags[Dirty::BlendColor] = false; @@ -1513,14 +1478,13 @@ void RasterizerOpenGL::SyncBlendState() { } void RasterizerOpenGL::SyncLogicOpState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LogicOp]) { return; } flags[Dirty::LogicOp] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.logic_op.enable) { glEnable(GL_COLOR_LOGIC_OP); glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); @@ -1530,14 +1494,13 @@ void RasterizerOpenGL::SyncLogicOpState() { } void RasterizerOpenGL::SyncScissorTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::Scissors]) { return; } flags[Dirty::Scissors] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -1556,16 +1519,15 @@ void RasterizerOpenGL::SyncScissorTest() { } void RasterizerOpenGL::SyncPointState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PointSize]) { return; } flags[Dirty::PointSize] = false; - oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); - if (gpu.regs.vp_point_size.enable) { + if (maxwell3d.regs.vp_point_size.enable) { // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. glEnable(GL_PROGRAM_POINT_SIZE); return; @@ -1573,32 +1535,30 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
- glPointSize(std::max(1.0f, gpu.regs.point_size)); + glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncLineState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LineWidth]) { return; } flags[Dirty::LineWidth] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable); glLineWidth(regs.line_smooth_enable ? regs.line_width_smooth : regs.line_width_aliased); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonOffset]) { return; } flags[Dirty::PolygonOffset] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); @@ -1612,18 +1572,13 @@ void RasterizerOpenGL::SyncPolygonOffset() { } void RasterizerOpenGL::SyncAlphaTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::AlphaTest]) { return; } flags[Dirty::AlphaTest] = false; - const auto& regs = gpu.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - + const auto& regs = maxwell3d.regs; if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); @@ -1633,20 +1588,19 @@ void RasterizerOpenGL::SyncAlphaTest() { } void RasterizerOpenGL::SyncFramebufferSRGB() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FramebufferSRGB]) { return; } flags[Dirty::FramebufferSRGB] = false; - oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); + oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb); } void RasterizerOpenGL::SyncTransformFeedback() { // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal // when this is required. 
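All of the Sync* routines in this stretch share one shape: test a dirty bit, clear it, then reprogram GL from the cached maxwell3d registers, going through the oglEnable helper wherever a capability toggle is involved. A self-contained sketch of that pattern; Flags, DirtyExample and SyncExample are illustrative stand-ins, and oglEnable is assumed to be the usual two-way glEnable/glDisable dispatch from this file's anonymous namespace:

#include <bitset>
#include <cstddef>
#include <glad/glad.h>

using Flags = std::bitset<64>;          // stand-in for Maxwell3D::DirtyState::Flags
constexpr std::size_t DirtyExample = 0; // stand-in for one entry of the Dirty:: tables

void oglEnable(GLenum cap, bool state) {
    (state ? glEnable : glDisable)(cap); // single branch over the guest register value
}

void SyncExample(Flags& flags, bool guest_enable) {
    if (!flags[DirtyExample]) {
        return; // register untouched since the last draw: skip the GL call entirely
    }
    flags[DirtyExample] = false; // clear first so a later guest write re-dirties it
    oglEnable(GL_DEPTH_CLAMP, guest_enable);
}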
- const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; static constexpr std::size_t STRIDE = 3; std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; @@ -1698,7 +1652,7 @@ void RasterizerOpenGL::SyncTransformFeedback() { } void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1741,7 +1695,7 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { } void RasterizerOpenGL::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ccc6f50f6..1d0f585fa 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -36,8 +36,8 @@ #include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" -namespace Core { -class System; +namespace Core::Memory { +class Memory; } namespace Core::Frontend { @@ -53,11 +53,19 @@ namespace OpenGL { struct ScreenInfo; struct DrawParameters; +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); + class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - const Device& device, ScreenInfo& info, - ProgramManager& program_manager, StateTracker& state_tracker); + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Core::Memory::Memory& cpu_memory, const Device& device, + ScreenInfo& screen_info, ProgramManager& program_manager, + StateTracker& state_tracker); ~RasterizerOpenGL() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -83,9 +91,8 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { @@ -126,7 +133,7 @@ private: /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size, GLuint64EXT* pointer); + size_t size, BindlessSSBO* ssbo); /// Configures the current textures to use for the draw command. 
void SetupDrawTextures(std::size_t stage_index, Shader* shader); @@ -237,7 +244,15 @@ private: void SetupShaders(GLenum primitive_mode); + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + const Device& device; + ScreenInfo& screen_info; + ProgramManager& program_manager; + StateTracker& state_tracker; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; @@ -247,10 +262,6 @@ private: OGLBufferCache buffer_cache; FenceManagerOpenGL fence_manager; - Core::System& system; - ScreenInfo& screen_info; - ProgramManager& program_manager; - StateTracker& state_tracker; VideoCommon::Shader::AsyncShaders async_shaders; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index a787e27d2..0ebcec427 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -82,11 +83,13 @@ void OGLSampler::Release() { handle = 0; } -void OGLShader::Create(const char* source, GLenum type) { - if (handle != 0) +void OGLShader::Create(std::string_view source, GLenum type) { + if (handle != 0) { return; - if (source == nullptr) + } + if (source.empty()) { return; + } MICROPROFILE_SCOPE(OpenGL_ResourceCreation); handle = GLShader::LoadShader(source, type); diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index b05cb641c..f48398669 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,6 +4,7 @@ #pragma once +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -127,7 +128,7 @@ public: return *this; } - void Create(const char* source, GLenum type); + void Create(std::string_view source, GLenum type); void Release(); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index eb49a36bf..bd56bed0c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -22,6 +22,7 @@ #include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" @@ -238,12 +239,11 @@ std::unique_ptr<Shader> Shader::CreateStageFromMemory( ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); - auto& gpu = params.system.GPU(); + auto& gpu = params.gpu; gpu.ShaderNotify().MarkSharderBuilding(); auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D()); - if (!async_shaders.IsShaderAsync(params.system.GPU()) || - !params.device.UseAsynchronousShaders()) { + if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) { const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, 
*registry); // TODO(Rodrigo): Handle VertexA shaders // std::optional<ShaderIR> ir_b; @@ -286,11 +286,10 @@ std::unique_ptr<Shader> Shader::CreateStageFromMemory( std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - auto& gpu = params.system.GPU(); + auto& gpu = params.gpu; gpu.ShaderNotify().MarkSharderBuilding(); - auto& engine = gpu.KeplerCompute(); - auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); @@ -319,15 +318,20 @@ std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device) - : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, - emu_window{emu_window}, device{device}, disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_}, + gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, device{device_} {} ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; -void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { + disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; @@ -480,21 +484,19 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, VideoCommon::Shader::AsyncShaders& async_shaders) { - if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { + if (!maxwell3d.dirty.flags[Dirty::Shaders]) { auto* last_shader = last_shaders[static_cast<std::size_t>(program)]; if (last_shader->IsBuilt()) { return last_shader; } } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr address{GetShaderAddress(system, program)}; + const GPUVAddr address{GetShaderAddress(maxwell3d, program)}; if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { auto completed_work = async_shaders.GetCompletedWork(); for (auto& work : completed_work) { Shader* shader = TryGet(work.cpu_address); - auto& gpu = system.GPU(); gpu.ShaderNotify().MarkShaderComplete(); if (shader == nullptr) { continue; @@ -506,14 +508,13 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, shader->AsyncGLASMBuilt(std::move(work.program.glasm)); } + auto& registry = shader->GetRegistry(); + ShaderDiskCacheEntry entry; entry.type = work.shader_type; entry.code = std::move(work.code); entry.code_b = std::move(work.code_b); entry.unique_identifier = work.uid; - - auto& registry = shader->GetRegistry(); - entry.bound_buffer = registry.GetBoundBuffer(); entry.graphics_info = 
registry.GetGraphicsInfo(); entry.keys = registry.GetKeys(); @@ -524,28 +525,28 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, } // Look up shader in the cache based on address - const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)}; if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } - const auto host_ptr{memory_manager.GetPointer(address)}; + const u8* const host_ptr{gpu_memory.GetPointer(address)}; // No shader found - create a new one - ProgramCode code{GetShaderCode(memory_manager, address, host_ptr, false)}; + ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)}; ProgramCode code_b; if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; - const u8* host_ptr_b = memory_manager.GetPointer(address_b); - code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false); + const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)}; + const u8* host_ptr_b = gpu_memory.GetPointer(address_b); + code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false); } const std::size_t code_size = code.size() * sizeof(u64); const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, maxwell3d, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); @@ -567,21 +568,20 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, } Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)}; if (Shader* const kernel = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } - const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; + const u8* host_ptr{gpu_memory.GetPointer(code_addr)}; + ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)}; const std::size_t code_size{code.size() * sizeof(u64)}; const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, kepler_compute, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 7528ac686..1708af06a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -25,8 +25,8 @@ #include "video_core/shader/shader_ir.h" #include "video_core/shader_cache.h" -namespace Core { -class System; +namespace Tegra { +class MemoryManager; } namespace Core::Frontend { @@ -57,11 +57,12 @@ struct PrecompiledShader { }; struct ShaderParameters { - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::ConstBufferEngineInterface& engine; ShaderDiskCacheOpenGL& disk_cache; const Device& device; VAddr cpu_addr; - u8* host_ptr; + const u8* host_ptr; u64 unique_identifier; }; @@ -118,12 +119,14 @@ private: class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window, + Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const Device& device); ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program @@ -138,9 +141,13 @@ private: const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats); - Core::System& system; Core::Frontend::EmuWindow& emu_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; const Device& device; + ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 3f75fcd2b..95ca96c8e 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -813,7 +813,7 @@ private: const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); const auto it = transform_feedback.find(location); if (it == transform_feedback.end()) { - return {}; + return std::nullopt; } return it->second.components; } @@ -1295,21 +1295,21 @@ private: switch (element) { case 0: UNIMPLEMENTED(); - return {}; + return 
std::nullopt; case 1: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_Layer", Type::Int}}; case 2: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_ViewportIndex", Type::Int}}; case 3: return {{"gl_PointSize", Type::Float}}; } - return {}; + return std::nullopt; case Attribute::Index::FrontColor: return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::FrontSecondaryColor: @@ -1332,7 +1332,7 @@ private: Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); - return {}; + return std::nullopt; } } @@ -1443,8 +1443,10 @@ private: return expr + ", vec2(0.0), vec2(0.0))"; case TextureType::TextureCube: return expr + ", vec3(0.0), vec3(0.0))"; + default: + UNREACHABLE(); + break; } - UNREACHABLE(); } for (const auto& variant : extras) { @@ -2054,15 +2056,19 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.is_shadow) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 40c0877c1..70dd0c3c6 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -206,13 +206,17 @@ bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const { flat_bindless_samplers.size(); } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default; ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; +void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) { + title_id = title_id_; +} + std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id - const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; + const bool has_title_id = title_id != 0; if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) { return std::nullopt; } @@ -313,8 +317,7 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo return std::nullopt; } } - - return std::move(entries); + return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { @@ -474,7 +477,7 @@ std::string ShaderDiskCacheOpenGL::GetBaseDir() const { } std::string ShaderDiskCacheOpenGL::GetTitleID() const { - return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID()); + return fmt::format("{:016X}", title_id); } } // namespace OpenGL diff --git 
a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index db2bb73bc..aef841c1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -21,10 +21,6 @@ #include "video_core/engines/shader_type.h" #include "video_core/shader/registry.h" -namespace Core { -class System; -} - namespace Common::FS { class IOFile; } @@ -70,9 +66,12 @@ struct ShaderDiskCachePrecompiled { class ShaderDiskCacheOpenGL { public: - explicit ShaderDiskCacheOpenGL(Core::System& system); + explicit ShaderDiskCacheOpenGL(); ~ShaderDiskCacheOpenGL(); + /// Binds a title ID for all future operations. + void BindTitleID(u64 title_id); + /// Loads transferable cache. If the file has an old version or on failure, it deletes the file. std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); @@ -157,8 +156,6 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - Core::System& system; - // Stores whole precompiled cache which will be read from or saved to the precompiled cache // file FileSys::VectorVfsFile precompiled_cache_virtual_file; @@ -168,8 +165,11 @@ private: // Stored transferable shaders std::unordered_set<u64> stored_transferable; + /// Title ID to operate on + u64 title_id = 0; + // The cache has been loaded at boot - bool is_usable{}; + bool is_usable = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 9e74eda0d..4bf0d6090 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <string_view> #include <vector> #include <glad/glad.h> #include "common/assert.h" @@ -11,7 +12,8 @@ namespace OpenGL::GLShader { namespace { -const char* GetStageDebugName(GLenum type) { + +std::string_view StageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: return "vertex"; @@ -25,12 +27,17 @@ const char* GetStageDebugName(GLenum type) { UNIMPLEMENTED(); return "unknown"; } + } // Anonymous namespace -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type = GetStageDebugName(type); +GLuint LoadShader(std::string_view source, GLenum type) { + const std::string_view debug_type = StageDebugName(type); const GLuint shader_id = glCreateShader(type); - glShaderSource(shader_id, 1, &source, nullptr); + + const GLchar* source_string = source.data(); + const GLint source_length = static_cast<GLint>(source.size()); + + glShaderSource(shader_id, 1, &source_string, &source_length); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); glCompileShader(shader_id); diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h index 03b7548c2..1b770532e 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.h +++ b/src/video_core/renderer_opengl/gl_shader_util.h @@ -38,7 +38,7 @@ void LogShaderSource(T...
shaders) { * @param source String of the GLSL shader program * @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER) */ -GLuint LoadShader(const char* source, GLenum type); +GLuint LoadShader(std::string_view source, GLenum type); /** * Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader) diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index d24fad3de..6bcf831f2 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -214,10 +214,8 @@ void SetupDirtyMisc(Tables& tables) { } // Anonymous namespace -StateTracker::StateTracker(Core::System& system) : system{system} {} - -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; +StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { + auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyColorMasks(tables); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 0f823288e..9d127548f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -13,8 +13,8 @@ #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" -namespace Core { -class System; +namespace Tegra { +class GPU; } namespace OpenGL { @@ -90,9 +90,7 @@ static_assert(Last <= std::numeric_limits<u8>::max()); class StateTracker { public: - explicit StateTracker(Core::System& system); - - void Initialize(); + explicit StateTracker(Tegra::GPU& gpu); void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { @@ -103,7 +101,6 @@ public: } void NotifyScreenDrawVertexArray() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; @@ -117,98 +114,81 @@ public: } void NotifyPolygonModes() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonModes] = true; flags[OpenGL::Dirty::PolygonModeFront] = true; flags[OpenGL::Dirty::PolygonModeBack] = true; } void NotifyViewport0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Viewports] = true; flags[OpenGL::Dirty::Viewport0] = true; } void NotifyScissor0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Scissors] = true; flags[OpenGL::Dirty::Scissor0] = true; } void NotifyColorMask0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ColorMasks] = true; flags[OpenGL::Dirty::ColorMask0] = true; } void NotifyBlend0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::BlendStates] = true; flags[OpenGL::Dirty::BlendState0] = true; } void NotifyFramebuffer() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[VideoCommon::Dirty::RenderTargets] = true; } void NotifyFrontFace() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FrontFace] = true; } void NotifyCullTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::CullTest] = true; } void NotifyDepthMask() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthMask] = true; } void NotifyDepthTest() { - auto& flags = 
system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthTest] = true; } void NotifyStencilTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::StencilTest] = true; } void NotifyPolygonOffset() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonOffset] = true; } void NotifyRasterizeEnable() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::RasterizeEnable] = true; } void NotifyFramebufferSRGB() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FramebufferSRGB] = true; } void NotifyLogicOp() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::LogicOp] = true; } void NotifyClipControl() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ClipControl] = true; } void NotifyAlphaTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::AlphaTest] = true; } private: - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; GLuint index_buffer = 0; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index f403f388a..a863ef218 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -532,10 +532,12 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const { return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker) - : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} { +TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker_) + : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{ + state_tracker_} { src_framebuffer.Create(); dst_framebuffer.Create(); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index de8f18489..7787134fc 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -129,8 +129,10 @@ private: class TextureCacheOpenGL final : public TextureCacheBase { public: - explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker); + explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index fe9bd4b5a..a8be2aa37 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -47,6 +47,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedNorm: @@ -70,6 +72,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; + default: + break; 
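The default: break; arms added above (and in the hunks that follow) are the usual way to keep a deliberately partial enum switch warning-clean: with -Wswitch-style diagnostics promoted to errors, a switch must either enumerate every value or carry a default, and unhandled sizes here intentionally fall through to a shared error path after the switch. A compilable sketch of the idiom, using an illustrative enum rather than Maxwell's real attribute sizes:

#include <glad/glad.h>

enum class Size { Size_8, Size_16, Size_10_10_10_2 }; // illustrative stand-in

GLenum ToGL(Size size) {
    switch (size) {
    case Size::Size_8:
        return GL_UNSIGNED_BYTE;
    case Size::Size_16:
        return GL_UNSIGNED_SHORT;
    default:
        break; // sizes without a direct GL equivalent reach the error value below
    }
    return GL_NONE; // caller reports the unimplemented combination
}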
} break; case Maxwell::VertexAttribute::Type::Float: @@ -84,6 +88,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; + default: + break; } break; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index c39663db7..2ccca1993 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,6 +21,8 @@ #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -30,60 +32,6 @@ namespace OpenGL { namespace { -constexpr std::size_t SWAP_CHAIN_SIZE = 3; - -struct Frame { - u32 width{}; /// Width of the frame (to detect resize) - u32 height{}; /// Height of the frame - bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) - OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO - OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread - OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread - GLsync render_fence{}; /// Fence created on the render thread - GLsync present_fence{}; /// Fence created on the presentation thread - bool is_srgb{}; /// Framebuffer is sRGB or RGB -}; - -constexpr char VERTEX_SHADER[] = R"( -#version 430 core - -out gl_PerVertex { - vec4 gl_Position; -}; - -layout (location = 0) in vec2 vert_position; -layout (location = 1) in vec2 vert_tex_coord; -layout (location = 0) out vec2 frag_tex_coord; - -// This is a truncated 3x3 matrix for 2D transformations: -// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. -// The third column performs translation. -// The third row could be used for projection, which we don't need in 2D. It hence is assumed to -// implicitly be [0, 0, 1] -layout (location = 0) uniform mat3x2 modelview_matrix; - -void main() { - // Multiply input position by the rotscale part of the matrix and then manually translate by - // the last column. 
This is equivalent to using a full 3x3 matrix and expanding the vector - to `vec3(vert_position.xy, 1.0)` - gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); - frag_tex_coord = vert_tex_coord; -} -)"; - -constexpr char FRAGMENT_SHADER[] = R"( -#version 430 core - -layout (location = 0) in vec2 frag_tex_coord; -layout (location = 0) out vec4 color; - -layout (binding = 0) uniform sampler2D color_texture; - -void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); -} -)"; - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -96,24 +44,6 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; -/// Returns true if any debug tool is attached -bool HasDebugTool() { - const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - if (nsight) { - return true; - } - - GLint num_extensions; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); - for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { - const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); - if (!std::strcmp(name, "GL_EXT_debug_tool")) { - return true; - } - } - return false; -} - /** * Defines a 1:1 pixel orthographic projection matrix with (0,0) on the top-left * corner and (width, height) on the bottom-right. @@ -197,133 +127,15 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ?
GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -RendererOpenGL::RendererOpenGL(Core::System& system_, Core::Frontend::EmuWindow& emu_window_, - Tegra::GPU& gpu_, - std::unique_ptr<Core::Frontend::GraphicsContext> context_) - : RendererBase{emu_window_, std::move(context_)}, system{system_}, - emu_window{emu_window_}, gpu{gpu_}, program_manager{device}, has_debug_tool{HasDebugTool()} {} +RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_}, + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} RendererOpenGL::~RendererOpenGL() = default; -MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); -MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); - void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (!framebuffer) { return; @@ -332,79 +144,34 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PrepareRendertarget(framebuffer); RenderScreenshot(); - Frame* frame; - { - MICROPROFILE_SCOPE(OpenGL_WaitPresent); - - frame = frame_mailbox->GetRenderFrame(); - - // Clean up sync objects before drawing + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + DrawScreen(emu_window.GetFramebufferLayout()); - // INTEL driver workaround. 
We can't delete the previous render sync object until we are - // sure that the presentation is done - if (frame->present_fence) { - glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - } - - // delete the draw fence if the frame wasn't presented - if (frame->render_fence) { - glDeleteSync(frame->render_fence); - frame->render_fence = 0; - } - - // wait for the presentation to be done - if (frame->present_fence) { - glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(frame->present_fence); - frame->present_fence = 0; - } - } + ++m_current_frame; - { - MICROPROFILE_SCOPE(OpenGL_RenderFrame); - const auto& layout = render_window.GetFramebufferLayout(); - - // Recreate the frame if the size of the window has changed - if (layout.width != frame->width || layout.height != frame->height || - screen_info.display_srgb != frame->is_srgb) { - LOG_DEBUG(Render_OpenGL, "Reloading render frame"); - frame->is_srgb = screen_info.display_srgb; - frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); - DrawScreen(layout); - // Create a fence for the frontend to wait on and swap this frame to OffTex - frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - frame_mailbox->ReleaseRenderFrame(frame); - m_current_frame++; - rasterizer->TickFrame(); - } + rasterizer->TickFrame(); render_window.PollEvents(); - if (has_debug_tool) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); - Present(0); - context->SwapBuffers(); - } + context->SwapBuffers(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { - if (framebuffer) { - // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || - screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format || - gl_framebuffer_data.empty()) { - // Reallocate texture if the framebuffer size has changed. - // This is expected to not happen very often and hence should not be a - // performance problem. - ConfigureFramebufferTexture(screen_info.texture, *framebuffer); - } - - // Load the framebuffer from memory, draw it to the screen, and swap buffers - LoadFBToScreenInfo(*framebuffer); + if (!framebuffer) { + return; + } + // If framebuffer is provided, reload it from memory to a texture + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { + // Reallocate texture if the framebuffer size has changed. + // This is expected to not happen very often and hence should not be a + // performance problem. 
+ ConfigureFramebufferTexture(screen_info.texture, *framebuffer); } + + // Load the framebuffer from memory, draw it to the screen, and swap buffers + LoadFBToScreenInfo(*framebuffer); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -424,7 +191,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)}; + u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)}; rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); // TODO(Rodrigo): Read this from HLE @@ -454,17 +221,15 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color } void RendererOpenGL::InitOpenGLObjects() { - frame_mailbox = std::make_unique<FrameMailbox>(); - glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), Settings::values.bg_blue.GetValue(), 0.0f); // Create shader programs OGLShader vertex_shader; - vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + vertex_shader.Create(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); OGLShader fragment_shader; - fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + fragment_shader.Create(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); vertex_program.Create(true, false, vertex_shader.handle); fragment_program.Create(true, false, fragment_shader.handle); @@ -509,7 +274,6 @@ void RendererOpenGL::AddTelemetryFields() { LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor); LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model); - auto& telemetry_session = system.TelemetrySession(); constexpr auto user_system = Common::Telemetry::FieldType::UserSystem; telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor); telemetry_session.AddField(user_system, "GPU_Model", gpu_model); @@ -520,8 +284,8 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info, - program_manager, state_tracker); + rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device, + screen_info, program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -684,51 +448,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.RestoreGuestPipeline(); } -bool RendererOpenGL::TryPresent(int timeout_ms) { - if (has_debug_tool) { - LOG_DEBUG(Render_OpenGL, - "Skipping presentation because we are presenting on the main context"); - return false; - } - return Present(timeout_ms); -} - -bool RendererOpenGL::Present(int timeout_ms) { - const auto& layout = render_window.GetFramebufferLayout(); - auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); - if (!frame) { - LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); - return false; - } - - // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a - // readback since we won't be doing any blending - glClear(GL_COLOR_BUFFER_BIT); - - // Recreate the presentation FBO if the color attachment was changed - if (frame->color_reloaded) { - LOG_DEBUG(Render_OpenGL, "Reloading present frame"); - frame_mailbox->ReloadPresentFrame(frame, 
layout.width, layout.height); - } - glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); - // INTEL workaround. - // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete - // it on the emulation thread without too much penalty - // glDeleteSync(frame.render_sync); - // frame.render_sync = 0; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); - glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, - GL_COLOR_BUFFER_BIT, GL_LINEAR); - - // Insert fence for the main thread to block on - frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - return true; -} - void RendererOpenGL::RenderScreenshot() { if (!renderer_settings.screenshot_requested) { return; @@ -743,7 +462,7 @@ void RendererOpenGL::RenderScreenshot() { screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 52ea76b7d..9ef181f95 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -16,16 +16,25 @@ namespace Core { class System; -} +class TelemetrySession; +} // namespace Core namespace Core::Frontend { class EmuWindow; } +namespace Core::Memory { +class Memory; +} + namespace Layout { struct FramebufferLayout; } +namespace Tegra { +class GPU; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -46,17 +55,10 @@ struct ScreenInfo { TextureInfo texture; }; -struct PresentationTexture { - u32 width = 0; - u32 height = 0; - OGLTexture texture; -}; - -class FrameMailbox; - class RendererOpenGL final : public VideoCore::RendererBase { public: - explicit RendererOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, + explicit RendererOpenGL(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, Tegra::GPU& gpu, std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererOpenGL() override; @@ -64,7 +66,6 @@ public: bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; private: /// Initializes the OpenGL state and creates persistent objects. 
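With the FrameMailbox and the TryPresent()/Present() pair removed above, presentation no longer hops between a present thread and the emulation thread: SwapBuffers() on the GPU thread now draws and presents in one pass. A condensed sketch of the resulting control flow, assuming the member names from the hunks above; the full body also keeps the framebuffer-reload and screenshot paths shown earlier, so treat this as an outline rather than the exact implementation:

// Sketch only: an assumed outline of the simplified presentation path.
void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    PrepareRendertarget(framebuffer);                 // reload the guest framebuffer if one was provided
    DrawScreen(render_window.GetFramebufferLayout()); // draw directly, no mailbox handoff
    ++m_current_frame;
    rasterizer->TickFrame();
    render_window.PollEvents();
    context->SwapBuffers();                           // present on the main context; TryPresent() is gone
}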
@@ -92,14 +93,13 @@ private: void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); - bool Present(int timeout_ms); - - Core::System& system; + Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; + Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; - const Device device; - StateTracker state_tracker{system}; + const Device device; + StateTracker state_tracker{gpu}; // OpenGL object IDs OGLBuffer vertex_buffer; @@ -123,11 +123,6 @@ private: /// Used for transforming the framebuffer orientation Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{}; Common::Rectangle<int> framebuffer_crop_rect; - - /// Frame presentation mailbox - std::unique_ptr<FrameMailbox> frame_mailbox; - - bool has_debug_tool = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 81a39a3b8..da5c550ea 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -58,6 +58,7 @@ void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_sta logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); logic_op.Assign(PackLogicOp(regs.logic_op.operation)); rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0); + topology.Assign(regs.draw.topology); std::memcpy(&point_size, ®s.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast @@ -131,7 +132,6 @@ void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size } void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { - const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); u32 packed_front_face = PackFrontFace(regs.front_face); if (regs.screen_y_control.triangle_rast_flip != 0) { // Flip front face @@ -161,7 +161,6 @@ void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { depth_test_enable.Assign(regs.depth_test_enable); front_face.Assign(packed_front_face); depth_test_func.Assign(PackComparisonOp(regs.depth_test_func)); - topology.Assign(topology_index); cull_face.Assign(PackCullFace(regs.cull_face)); cull_enable.Assign(regs.cull_test_enabled != 0 ? 
1 : 0); diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index cdcbb65f5..2c18eeaae 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -150,9 +150,8 @@ struct FixedPipelineState { }; union { u32 raw2; - BitField<0, 4, u32> topology; - BitField<4, 2, u32> cull_face; - BitField<6, 1, u32> cull_enable; + BitField<0, 2, u32> cull_face; + BitField<2, 1, u32> cull_enable; }; std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings; @@ -169,10 +168,6 @@ struct FixedPipelineState { Maxwell::FrontFace FrontFace() const noexcept { return UnpackFrontFace(front_face.Value()); } - - constexpr Maxwell::PrimitiveTopology Topology() const noexcept { - return static_cast<Maxwell::PrimitiveTopology>(topology.Value()); - } }; union { @@ -190,6 +185,7 @@ struct FixedPipelineState { BitField<18, 1, u32> logic_op_enable; BitField<19, 4, u32> logic_op; BitField<23, 1, u32> rasterize_enable; + BitField<24, 4, Maxwell::PrimitiveTopology> topology; }; u32 point_size; std::array<u32, Maxwell::NumVertexArrays> binding_divisors; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index f8c77f4fa..d22de1d81 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -78,9 +78,10 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; + default: + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); - return {}; } VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -298,9 +299,10 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; case Maxwell::PrimitiveTopology::Patches: return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; + default: + UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); - return {}; } VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { @@ -325,6 +327,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_UNORM; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedNorm: @@ -347,6 +351,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_SNORM; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SNORM_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::UnsignedScaled: @@ -369,6 +375,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_USCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_USCALED_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedScaled: @@ -391,6 +399,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return 
VK_FORMAT_R16G16B16A16_SSCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SSCALED_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::UnsignedInt: @@ -421,6 +431,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32A32_UINT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_UINT_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedInt: @@ -451,6 +463,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32A32_SINT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SINT_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::Float: @@ -471,6 +485,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; + default: + break; } break; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index ae46e0444..f2610868e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -25,9 +25,9 @@ #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -56,7 +56,7 @@ VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, VkDebugUtilsMessageTypeFlagsEXT type, const VkDebugUtilsMessengerCallbackDataEXT* data, [[maybe_unused]] void* user_data) { - const char* message{data->pMessage}; + const char* const message{data->pMessage}; if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { LOG_CRITICAL(Render_Vulkan, "{}", message); @@ -86,15 +86,15 @@ Common::DynamicLibrary OpenVulkanLibrary() { if (!library.Open(filename.c_str())) { // Android devices may not have libvulkan.so.1, only libvulkan.so. filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); - library.Open(filename.c_str()); + (void)library.Open(filename.c_str()); } #endif return library; } -vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatch& dld, - WindowSystemType window_type = WindowSystemType::Headless, - bool enable_layers = false) { +std::pair<vk::Instance, u32> CreateInstance( + Common::DynamicLibrary& library, vk::InstanceDispatch& dld, + WindowSystemType window_type = WindowSystemType::Headless, bool enable_layers = false) { if (!library.IsOpen()) { LOG_ERROR(Render_Vulkan, "Vulkan library not available"); return {}; @@ -180,7 +180,10 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc } } - vk::Instance instance = vk::Instance::Create(layers, extensions, dld); + // Limit the maximum version of Vulkan to avoid using untested version. 
+ const u32 version = std::min(vk::AvailableVersion(dld), static_cast<u32>(VK_API_VERSION_1_1)); + + vk::Instance instance = vk::Instance::Create(version, layers, extensions, dld); if (!instance) { LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); return {}; @@ -188,7 +191,7 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc if (!vk::Load(*instance, dld)) { LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); } - return instance; + return std::make_pair(std::move(instance), version); } std::string GetReadableVersion(u32 version) { @@ -237,10 +240,12 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext } // Anonymous namespace -RendererVulkan::RendererVulkan(Core::System& system_, Core::Frontend::EmuWindow& emu_window, - Tegra::GPU& gpu_, +RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, std::unique_ptr<Core::Frontend::GraphicsContext> context) - : RendererBase{emu_window, std::move(context)}, system{system_}, gpu{gpu_} {} + : RendererBase{emu_window, std::move(context)}, telemetry_session{telemetry_session_}, + cpu_memory{cpu_memory_}, gpu{gpu_} {} RendererVulkan::~RendererVulkan() { ShutDown(); @@ -267,11 +272,11 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { scheduler->WaitWorker(); swapchain->AcquireNextImage(); - const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); + const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated); - scheduler->Flush(false, render_semaphore); + scheduler->Flush(render_semaphore); - if (swapchain->Present(render_semaphore, fence)) { + if (swapchain->Present(render_semaphore)) { blit_screen->Recreate(); } @@ -281,15 +286,10 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { render_window.PollEvents(); } -bool RendererVulkan::TryPresent(int /*timeout_ms*/) { - // TODO (bunnei): ImplementMe - return true; -} - bool RendererVulkan::Init() { library = OpenVulkanLibrary(); - instance = CreateInstance(library, dld, render_window.GetWindowInfo().type, - Settings::values.renderer_debug); + std::tie(instance, instance_version) = CreateInstance( + library, dld, render_window.GetWindowInfo().type, Settings::values.renderer_debug); if (!instance || !CreateDebugCallback() || !CreateSurface() || !PickDevices()) { return false; } @@ -298,23 +298,21 @@ bool RendererVulkan::Init() { memory_manager = std::make_unique<VKMemoryManager>(*device); - resource_manager = std::make_unique<VKResourceManager>(*device); + state_tracker = std::make_unique<StateTracker>(gpu); + + scheduler = std::make_unique<VKScheduler>(*device, *state_tracker); const auto& framebuffer = render_window.GetFramebufferLayout(); - swapchain = std::make_unique<VKSwapchain>(*surface, *device); + swapchain = std::make_unique<VKSwapchain>(*surface, *device, *scheduler); swapchain->Create(framebuffer.width, framebuffer.height, false); - state_tracker = std::make_unique<StateTracker>(system); - - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); - - rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, - *state_tracker, *scheduler); + rasterizer = std::make_unique<RasterizerVulkan>(render_window, gpu, gpu.MemoryManager(), + cpu_memory, screen_info, *device, 
+ *memory_manager, *state_tracker, *scheduler); - blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, - *resource_manager, *memory_manager, *swapchain, - *scheduler, screen_info); + blit_screen = + std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device, + *memory_manager, *swapchain, *scheduler, screen_info); return true; } @@ -332,7 +330,6 @@ void RendererVulkan::ShutDown() { scheduler.reset(); swapchain.reset(); memory_manager.reset(); - resource_manager.reset(); device.reset(); } @@ -422,7 +419,8 @@ bool RendererVulkan::PickDevices() { return false; } - device = std::make_unique<VKDevice>(*instance, physical_device, *surface, dld); + device = + std::make_unique<VKDevice>(*instance, instance_version, physical_device, *surface, dld); return device->Create(); } @@ -432,7 +430,7 @@ void RendererVulkan::Report() const { const std::string driver_version = GetDriverVersion(*device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->GetApiVersion()); + const std::string api_version = GetReadableVersion(device->ApiVersion()); const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); @@ -440,8 +438,7 @@ LOG_INFO(Render_Vulkan, "Device: {}", model_name); LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); - auto& telemetry_session = system.TelemetrySession(); - constexpr auto field = Common::Telemetry::FieldType::UserSystem; + static constexpr auto field = Common::Telemetry::FieldType::UserSystem; telemetry_session.AddField(field, "GPU_Vendor", vendor_name); telemetry_session.AddField(field, "GPU_Model", model_name); telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); @@ -452,7 +449,7 @@ std::vector<std::string> RendererVulkan::EnumerateDevices() { vk::InstanceDispatch dld; Common::DynamicLibrary library = OpenVulkanLibrary(); - vk::Instance instance = CreateInstance(library, dld); + vk::Instance instance = CreateInstance(library, dld).first; if (!instance) { return {}; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 13debbbc0..1044ca124 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -14,7 +14,15 @@ #include "video_core/renderer_vulkan/wrapper.h" namespace Core { -class System; +class TelemetrySession; +} + +namespace Core::Memory { +class Memory; +} + +namespace Tegra { +class GPU; } namespace Vulkan { @@ -22,9 +30,7 @@ namespace Vulkan { class StateTracker; class VKBlitScreen; class VKDevice; -class VKFence; class VKMemoryManager; -class VKResourceManager; class VKSwapchain; class VKScheduler; class VKImage; @@ -38,7 +44,8 @@ struct VKScreenInfo { class RendererVulkan final : public VideoCore::RendererBase { public: - explicit RendererVulkan(Core::System& system, Core::Frontend::EmuWindow& emu_window, + explicit RendererVulkan(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, Tegra::GPU& gpu, std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererVulkan() override; @@ -46,7 +53,6 @@ public: bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; static std::vector<std::string> 
EnumerateDevices(); @@ -59,24 +65,26 @@ private: void Report() const; - Core::System& system; + Core::TelemetrySession& telemetry_session; + Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; Common::DynamicLibrary library; vk::InstanceDispatch dld; vk::Instance instance; + u32 instance_version{}; + vk::SurfaceKHR surface; VKScreenInfo screen_info; vk::DebugCallback debug_callback; std::unique_ptr<VKDevice> device; - std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKMemoryManager> memory_manager; - std::unique_ptr<VKResourceManager> resource_manager; std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; + std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKBlitScreen> blit_screen; }; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index a551e3de8..b5b60309e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -12,11 +12,9 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/math_util.h" - #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/memory.h" - #include "video_core/gpu.h" #include "video_core/morton.h" #include "video_core/rasterizer_interface.h" @@ -24,8 +22,8 @@ #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -210,17 +208,15 @@ struct VKBlitScreen::BufferData { // Unaligned image data goes here }; -VKBlitScreen::VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info) - : system{system}, render_window{render_window}, rasterizer{rasterizer}, device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, swapchain{swapchain}, - scheduler{scheduler}, image_count{swapchain.GetImageCount()}, screen_info{screen_info} { - watches.resize(image_count); - std::generate(watches.begin(), watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); +VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, + Core::Frontend::EmuWindow& render_window_, + VideoCore::RasterizerInterface& rasterizer_, const VKDevice& device_, + VKMemoryManager& memory_manager_, VKSwapchain& swapchain_, + VKScheduler& scheduler_, const VKScreenInfo& screen_info_) + : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_}, + device{device_}, memory_manager{memory_manager_}, swapchain{swapchain_}, + scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + resource_ticks.resize(image_count); CreateStaticResources(); CreateDynamicResources(); @@ -232,15 +228,16 @@ void VKBlitScreen::Recreate() { CreateDynamicResources(); } -std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { +VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& 
framebuffer, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); const std::size_t image_index = swapchain.GetImageIndex(); - watches[image_index]->Watch(scheduler.GetFence()); + + scheduler.Wait(resource_ticks[image_index]); + resource_ticks[image_index] = scheduler.CurrentTick(); VKImage* blit_image = use_accelerated ? screen_info.image : raw_images[image_index].get(); @@ -259,7 +256,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon const auto pixel_format = VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; - const auto host_ptr = system.Memory().GetPointer(framebuffer_addr); + const auto host_ptr = cpu_memory.GetPointer(framebuffer_addr); rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer)); // TODO(Rodrigo): Read this from HLE @@ -343,7 +340,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon cmdbuf.EndRenderPass(); }); - return {scheduler.GetFence(), *semaphores[image_index]}; + return *semaphores[image_index]; } void VKBlitScreen::CreateStaticResources() { @@ -711,7 +708,7 @@ void VKBlitScreen::CreateFramebuffers() { void VKBlitScreen::ReleaseRawImages() { for (std::size_t i = 0; i < raw_images.size(); ++i) { - watches[i]->Wait(); + scheduler.Wait(resource_ticks.at(i)); } raw_images.clear(); raw_buffer_commits.clear(); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 243640fab..8f2839214 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -5,16 +5,18 @@ #pragma once #include <memory> -#include <tuple> #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Core { class System; } +namespace Core::Memory { +class Memory; +} + namespace Core::Frontend { class EmuWindow; } @@ -30,26 +32,26 @@ class RasterizerInterface; namespace Vulkan { struct ScreenInfo; + class RasterizerVulkan; class VKDevice; -class VKFence; class VKImage; class VKScheduler; class VKSwapchain; class VKBlitScreen final { public: - explicit VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, + Core::Frontend::EmuWindow& render_window, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info); + VKMemoryManager& memory_manager, VKSwapchain& swapchain, + VKScheduler& scheduler, const VKScreenInfo& screen_info); ~VKBlitScreen(); void Recreate(); - std::tuple<VKFence&, VkSemaphore> Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated); private: struct BufferData; @@ -81,11 +83,10 @@ private: u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, std::size_t image_index) const; - Core::System& system; + Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; VideoCore::RasterizerInterface& rasterizer; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; 
VKSwapchain& swapchain; VKScheduler& scheduler; @@ -106,7 +107,7 @@ private: vk::Buffer buffer; VKMemoryCommit buffer_commit; - std::vector<std::unique_ptr<VKFenceWatch>> watches; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> semaphores; std::vector<std::unique_ptr<VKImage>> raw_images; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 1d2f8b557..d9d3da9ea 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -145,14 +145,15 @@ void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst }); } -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const VKDevice& device_, VKMemoryManager& memory_manager_, + VKScheduler& scheduler_, VKStagingBufferPool& staging_pool_) + : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, gpu_memory, cpu_memory, + CreateStreamBuffer(device_, + scheduler_)}, + device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ + staging_pool_} {} VKBufferCache::~VKBufferCache() = default; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 991ee451c..7fb5ceedf 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -13,10 +13,6 @@ #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/wrapper.h" -namespace Core { -class System; -} - namespace Vulkan { class VKDevice; @@ -53,7 +49,8 @@ private: class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { public: - explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKBufferCache(); diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp new file mode 100644 index 000000000..6339f4fe0 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -0,0 +1,46 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <cstddef> + +#include "video_core/renderer_vulkan/vk_command_pool.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +constexpr size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; + +struct CommandPool::Pool { + vk::CommandPool handle; + vk::CommandBuffers cmdbufs; +}; + +CommandPool::CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device) + : ResourcePool(master_semaphore, COMMAND_BUFFER_POOL_SIZE), device{device} {} + +CommandPool::~CommandPool() = default; + +void CommandPool::Allocate(size_t begin, size_t end) { + // Command buffers are going to be committed, recorded, and executed every single usage cycle. + // They are also going to be reset when committed. + Pool& pool = pools.emplace_back(); + pool.handle = device.GetLogical().CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); +} + +VkCommandBuffer CommandPool::Commit() { + const size_t index = CommitResource(); + const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; + const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; + return pools[pool_index].cmdbufs[sub_index]; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_command_pool.h b/src/video_core/renderer_vulkan/vk_command_pool.h new file mode 100644 index 000000000..b9cb3fb5d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.h @@ -0,0 +1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <cstddef> +#include <vector> + +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class MasterSemaphore; +class VKDevice; + +class CommandPool final : public ResourcePool { +public: + explicit CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device); + ~CommandPool() override; + + void Allocate(size_t begin, size_t end) override; + + VkCommandBuffer Commit(); + +private: + struct Pool; + + const VKDevice& device; + std::vector<Pool> pools; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 182461ed9..9637c6059 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -112,7 +112,8 @@ constexpr u8 quad_array[] = { 0xf9, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() { return { @@ -218,7 +219,8 @@ constexpr u8 uint8_pass[] = { 0x2a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; // Quad indexed SPIR-V module. Generated from the "shaders/" directory. 
constexpr u8 QUAD_INDEXED_SPV[] = { @@ -341,7 +343,8 @@ constexpr u8 QUAD_INDEXED_SPV[] = { 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBindings() { return {{ @@ -448,12 +451,12 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto VKComputePass::~VKComputePass() = default; -VkDescriptorSet VKComputePass::CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence) { +VkDescriptorSet VKComputePass::CommitDescriptorSet( + VKUpdateDescriptorQueue& update_descriptor_queue) { if (!descriptor_template) { return nullptr; } - const auto set = descriptor_allocator->Commit(fence); + const VkDescriptorSet set = descriptor_allocator->Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -477,7 +480,7 @@ std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); @@ -520,13 +523,13 @@ Uint8Pass::~Uint8Pass() = default; std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset) { - const auto staging_size = static_cast<u32>(num_vertices * sizeof(u16)); + const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, @@ -589,7 +592,7 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 230b526bc..acc94f27e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -15,7 +15,6 @@ namespace Vulkan { class VKDevice; -class VKFence; class VKScheduler; class VKStagingBufferPool; class VKUpdateDescriptorQueue; @@ -30,8 +29,7 @@ public: ~VKComputePass(); protected: - VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& 
update_descriptor_queue, - VKFence& fence); + VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue); vk::DescriptorUpdateTemplateKHR descriptor_template; vk::PipelineLayout layout; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index ed9d2991c..9be72dc9b 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -32,7 +32,7 @@ VkDescriptorSet VKComputePipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index ac4a0884e..f38e089d5 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -7,7 +7,8 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { @@ -15,14 +16,15 @@ namespace Vulkan { // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines. constexpr std::size_t SETS_GROW_RATE = 0x20; -DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool, - VkDescriptorSetLayout layout) - : VKFencedPool{SETS_GROW_RATE}, descriptor_pool{descriptor_pool}, layout{layout} {} +DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool_, + VkDescriptorSetLayout layout_) + : ResourcePool(descriptor_pool_.master_semaphore, SETS_GROW_RATE), + descriptor_pool{descriptor_pool_}, layout{layout_} {} DescriptorAllocator::~DescriptorAllocator() = default; -VkDescriptorSet DescriptorAllocator::Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); +VkDescriptorSet DescriptorAllocator::Commit() { + const std::size_t index = CommitResource(); return descriptors_allocations[index / SETS_GROW_RATE][index % SETS_GROW_RATE]; } @@ -30,8 +32,9 @@ void DescriptorAllocator::Allocate(std::size_t begin, std::size_t end) { descriptors_allocations.push_back(descriptor_pool.AllocateDescriptors(layout, end - begin)); } -VKDescriptorPool::VKDescriptorPool(const VKDevice& device) - : device{device}, active_pool{AllocateNewPool()} {} +VKDescriptorPool::VKDescriptorPool(const VKDevice& device_, VKScheduler& scheduler) + : device{device_}, master_semaphore{scheduler.GetMasterSemaphore()}, active_pool{ + AllocateNewPool()} {} VKDescriptorPool::~VKDescriptorPool() = default; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.h b/src/video_core/renderer_vulkan/vk_descriptor_pool.h index 9efa66bef..544f32a20 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.h +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.h @@ -6,21 +6,24 @@ #include <vector> -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +class VKDevice; class VKDescriptorPool; +class VKScheduler; -class 
DescriptorAllocator final : public VKFencedPool { +class DescriptorAllocator final : public ResourcePool { public: explicit DescriptorAllocator(VKDescriptorPool& descriptor_pool, VkDescriptorSetLayout layout); ~DescriptorAllocator() override; + DescriptorAllocator& operator=(const DescriptorAllocator&) = delete; DescriptorAllocator(const DescriptorAllocator&) = delete; - VkDescriptorSet Commit(VKFence& fence); + VkDescriptorSet Commit(); protected: void Allocate(std::size_t begin, std::size_t end) override; @@ -36,15 +39,19 @@ class VKDescriptorPool final { friend DescriptorAllocator; public: - explicit VKDescriptorPool(const VKDevice& device); + explicit VKDescriptorPool(const VKDevice& device, VKScheduler& scheduler); ~VKDescriptorPool(); + VKDescriptorPool(const VKDescriptorPool&) = delete; + VKDescriptorPool& operator=(const VKDescriptorPool&) = delete; + private: vk::DescriptorPool* AllocateNewPool(); vk::DescriptorSets AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count); const VKDevice& device; + MasterSemaphore& master_semaphore; std::vector<vk::DescriptorPool> pools; vk::DescriptorPool* active_pool; diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index ebcfaa0e3..f34ed6735 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -38,10 +38,14 @@ constexpr std::array Depth16UnormS8_UINT{ constexpr std::array REQUIRED_EXTENSIONS{ VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, + VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME, VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, + VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, @@ -78,6 +82,21 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType } } +[[nodiscard]] bool IsRDNA(std::string_view device_name, VkDriverIdKHR driver_id) { + static constexpr std::array RDNA_DEVICES{ + "5700", + "5600", + "5500", + "5300", + }; + if (driver_id != VK_DRIVER_ID_AMD_PROPRIETARY_KHR) { + return false; + } + return std::any_of(RDNA_DEVICES.begin(), RDNA_DEVICES.end(), [device_name](const char* name) { + return device_name.find(name) != std::string_view::npos; + }); +} + std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { static constexpr std::array formats{ @@ -171,10 +190,10 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( } // Anonymous namespace -VKDevice::VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld) - : dld{dld}, physical{physical}, properties{physical.GetProperties()}, - format_properties{GetFormatProperties(physical, dld)} { +VKDevice::VKDevice(VkInstance instance_, u32 instance_version_, vk::PhysicalDevice physical_, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld_) + : dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, + instance_version{instance_version_}, format_properties{GetFormatProperties(physical, dld)} { SetupFamilies(surface); SetupFeatures(); } @@ -250,6 +269,13 @@ bool VKDevice::Create() { .inheritedQueries = false, }; + 
VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, + .pNext = nullptr, + .timelineSemaphore = true, + }; + SetNext(next, timeline_semaphore); + VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, .pNext = nullptr, @@ -380,6 +406,15 @@ bool VKDevice::Create() { CollectTelemetryParameters(); + if (ext_extended_dynamic_state && IsRDNA(properties.deviceName, driver_id)) { + // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but on RDNA devices it + // seems to cause stability issues + LOG_WARNING( + Render_Vulkan, + "Blacklisting AMD proprietary on RDNA devices from VK_EXT_extended_dynamic_state"); + ext_extended_dynamic_state = false; + } + graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); @@ -565,20 +600,6 @@ bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { std::vector<const char*> VKDevice::LoadExtensions() { std::vector<const char*> extensions; - const auto Test = [&](const VkExtensionProperties& extension, - std::optional<std::reference_wrapper<bool>> status, const char* name, - bool push) { - if (extension.extensionName != std::string_view(name)) { - return; - } - if (push) { - extensions.push_back(name); - } - if (status) { - status->get() = true; - } - }; - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); @@ -587,28 +608,36 @@ std::vector<const char*> VKDevice::LoadExtensions() { bool has_ext_transform_feedback{}; bool has_ext_custom_border_color{}; bool has_ext_extended_dynamic_state{}; - for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) { - Test(extension, nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); - Test(extension, khr_uniform_buffer_standard_layout, + for (const VkExtensionProperties& extension : physical.EnumerateDeviceExtensionProperties()) { + const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name, + bool push) { + if (extension.extensionName != std::string_view(name)) { + return; + } + if (push) { + extensions.push_back(name); + } + if (status) { + status->get() = true; + } + }; + test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); + test(khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, - false); - Test(extension, ext_depth_range_unrestricted, - VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); - Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); - Test(extension, ext_shader_viewport_index_layer, - VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, - false); - Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, - false); - Test(extension, has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, - false); - Test(extension, has_ext_extended_dynamic_state, - VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); + 
test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); + test(ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, + true); + test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); + test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); + test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + if (instance_version >= VK_API_VERSION_1_1) { + test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); + } if (Settings::values.renderer_debug) { - Test(extension, nv_device_diagnostics_config, - VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, true); + test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, + true); } } @@ -742,13 +771,18 @@ void VKDevice::CollectTelemetryParameters() { VkPhysicalDeviceDriverPropertiesKHR driver{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, .pNext = nullptr, + .driverID = {}, + .driverName = {}, + .driverInfo = {}, + .conformanceVersion = {}, }; - VkPhysicalDeviceProperties2KHR properties{ + VkPhysicalDeviceProperties2KHR device_properties{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, .pNext = &driver, + .properties = {}, }; - physical.GetProperties2KHR(properties); + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 26a233db1..4286673d9 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -24,8 +24,8 @@ const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. class VKDevice final { public: - explicit VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld); + explicit VKDevice(VkInstance instance, u32 instance_version, vk::PhysicalDevice physical, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld); ~VKDevice(); /// Initializes the device. Returns true on success. @@ -82,8 +82,13 @@ public: return present_family; } + /// Returns the current instance Vulkan API version in Vulkan-formatted version numbers. + u32 InstanceApiVersion() const { + return instance_version; + } + /// Returns the current Vulkan API version provided in Vulkan-formatted version numbers. - u32 GetApiVersion() const { + u32 ApiVersion() const { return properties.apiVersion; } @@ -239,6 +244,7 @@ private: vk::Device logical; ///< Logical device. vk::Queue graphics_queue; ///< Main graphics queue. vk::Queue present_queue; ///< Main present queue. + u32 instance_version{}; ///< Vulkan instance version. u32 graphics_family{}; ///< Main graphics queue family index. u32 present_family{}; ///< Main present queue family index. VkDriverIdKHR driver_id{}; ///< Driver ID. 
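The LoadExtensions() rewrite above moves the old Test() helper into a test lambda declared inside the enumeration loop, so each candidate name is checked against the captured VkExtensionProperties instead of passing the record around. A minimal standalone sketch of that pattern follows; ProbeExtensions and the plain bool& flag are illustrative simplifications (the real code uses std::optional<std::reference_wrapper<bool>> and tracks many more extensions):

#include <string_view>
#include <vector>
#include <vulkan/vulkan.h>

// Returns the extension names to enable, setting feature flags as a side effect.
std::vector<const char*> ProbeExtensions(const std::vector<VkExtensionProperties>& available,
                                         bool& has_timeline_semaphore) {
    std::vector<const char*> enabled;
    for (const VkExtensionProperties& extension : available) {
        // One lambda per iteration captures the current extension record.
        const auto test = [&](bool& status, const char* name, bool push) {
            if (std::string_view{extension.extensionName} != name) {
                return;
            }
            if (push) {
                enabled.push_back(name);
            }
            status = true;
        };
        test(has_timeline_semaphore, VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, true);
    }
    return enabled;
}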
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index d7f65d435..5babbdd0b 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -29,8 +29,8 @@ void InnerFence::Queue() { } ASSERT(!event); - event = device.GetLogical().CreateNewEvent(); - ticks = scheduler.Ticks(); + event = device.GetLogical().CreateEvent(); + ticks = scheduler.CurrentTick(); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([event = *event](vk::CommandBuffer cmdbuf) { @@ -52,7 +52,7 @@ void InnerFence::Wait() { } ASSERT(event); - if (ticks >= scheduler.Ticks()) { + if (ticks >= scheduler.CurrentTick()) { scheduler.Flush(); } while (!IsEventSignalled()) { @@ -71,12 +71,12 @@ bool InnerFence::IsEventSignalled() const { } } -VKFenceManager::VKFenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKTextureCache& texture_cache, VKBufferCache& buffer_cache, - VKQueryCache& query_cache) - : GenericFenceManager(system, rasterizer, texture_cache, buffer_cache, query_cache), - device{device}, scheduler{scheduler} {} +VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device_, VKScheduler& scheduler_) + : GenericFenceManager(rasterizer, gpu, texture_cache, buffer_cache, query_cache), + device{device_}, scheduler{scheduler_} {} Fence VKFenceManager::CreateFence(u32 value, bool is_stubbed) { return std::make_shared<InnerFence>(device, scheduler, value, is_stubbed); diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 043fe7947..1547d6d30 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -55,10 +55,10 @@ using GenericFenceManager = class VKFenceManager final : public GenericFenceManager { public: - explicit VKFenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKTextureCache& texture_cache, VKBufferCache& buffer_cache, - VKQueryCache& query_cache); + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device, VKScheduler& scheduler); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 2e46c6278..0e8f9c352 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -93,7 +93,7 @@ VkDescriptorSet VKGraphicsPipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -159,6 +159,7 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, + .codeSize = 0, }; std::vector<vk::ShaderModule> modules; 
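The vk_fence_manager hunks above now measure progress in scheduler ticks rather than VKFence handles: InnerFence records the tick at which it was queued, and Wait() flushes the scheduler when that tick has not been submitted yet, because the commands that signal the event would otherwise never reach the GPU. A tiny self-contained model of that ordering rule, where SchedulerModel and WaitOnFence are simplified stand-ins rather than the real classes:

#include <cstdint>

// Simplified stand-in for VKScheduler's tick bookkeeping.
struct SchedulerModel {
    std::uint64_t current_tick = 1; // tick of the batch currently being recorded
    void Flush() { ++current_tick; } // submitting the batch opens a new tick
};

// Mirrors the guard at the top of InnerFence::Wait() in the hunk above.
void WaitOnFence(SchedulerModel& scheduler, std::uint64_t fence_tick) {
    if (fence_tick >= scheduler.current_tick) {
        // The fence was queued in the still-unsubmitted batch; flush it first.
        scheduler.Flush();
    }
    // ...then poll the Vulkan event until the GPU signals it.
}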
@@ -261,12 +262,12 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa vertex_input_ci.pNext = &input_divisor_ci; } - const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()); + const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, state.topology); const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()), + .topology = MaxwellToVK::PrimitiveTopology(device, state.topology), .primitiveRestartEnable = state.primitive_restart_enable != 0 && SupportsPrimitiveRestart(input_assembly_topology), }; @@ -388,6 +389,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .logicOp = VK_LOGIC_OP_COPY, .attachmentCount = static_cast<u32>(num_attachments), .pAttachments = cb_attachments.data(), + .blendConstants = {}, }; std::vector dynamic_states{ @@ -400,7 +402,6 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa static constexpr std::array extended{ VK_DYNAMIC_STATE_CULL_MODE_EXT, VK_DYNAMIC_STATE_FRONT_FACE_EXT, - VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT, VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT, VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT, diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp new file mode 100644 index 000000000..ae26e558d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -0,0 +1,56 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <atomic> +#include <chrono> + +#include "core/settings.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +using namespace std::chrono_literals; + +MasterSemaphore::MasterSemaphore(const VKDevice& device) { + static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, + .pNext = nullptr, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR, + .initialValue = 0, + }; + static constexpr VkSemaphoreCreateInfo semaphore_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &semaphore_type_ci, + .flags = 0, + }; + semaphore = device.GetLogical().CreateSemaphore(semaphore_ci); + + if (!Settings::values.renderer_debug) { + return; + } + // Validation layers have a bug where they fail to track resource usage when using timeline + // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To work around this issue, have + // a separate thread waiting for each timeline semaphore value. 
+ debug_thread = std::thread([this] { + u64 counter = 0; + while (!shutdown) { + if (semaphore.Wait(counter, 10'000'000)) { + ++counter; + } + } + }); +} + +MasterSemaphore::~MasterSemaphore() { + shutdown = true; + + // This thread might not have been started + if (debug_thread.joinable()) { + debug_thread.join(); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h new file mode 100644 index 000000000..0e93706d7 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <atomic> +#include <thread> + +#include "common/common_types.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class VKDevice; + +class MasterSemaphore { +public: + explicit MasterSemaphore(const VKDevice& device); + ~MasterSemaphore(); + + /// Returns the current logical tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return current_tick; + } + + /// Returns the timeline semaphore handle. + [[nodiscard]] VkSemaphore Handle() const noexcept { + return *semaphore; + } + + /// Returns true when a tick has been hit by the GPU. + [[nodiscard]] bool IsFree(u64 tick) { + return gpu_tick >= tick; + } + + /// Advances to the next logical tick. + void NextTick() noexcept { + ++current_tick; + } + + /// Refreshes the known GPU tick. + void Refresh() { + gpu_tick = semaphore.GetCounter(); + } + + /// Waits for a tick to be hit on the GPU. + void Wait(u64 tick) { + // No need to wait if the GPU is ahead of the tick + if (IsFree(tick)) { + return; + } + // Update the GPU tick and try again + Refresh(); + if (IsFree(tick)) { + return; + } + // If none of the above is hit, fall back to a regular wait + semaphore.Wait(tick); + } + +private: + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. + std::thread debug_thread; ///< Debug thread to work around validation layer bugs. 
+}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index cfdcdd6ab..dedc9c466 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -135,64 +135,56 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VideoCommon::Shader::ProgramCode program_code, u32 main_offset) - : gpu_addr{gpu_addr}, program_code{std::move(program_code)}, - registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, - compiler_settings, registry}, - entries{GenerateShaderEntries(shader_ir)} {} +Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine, Tegra::Engines::ShaderType stage, + GPUVAddr gpu_addr_, VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code_, + u32 main_offset) + : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage, engine), + shader_ir(program_code, main_offset, compiler_settings, registry), + entries(GenerateShaderEntries(shader_ir)) {} Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage) { - if (stage == ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } -} - -VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache) - : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device}, - scheduler{scheduler}, descriptor_pool{descriptor_pool}, - update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {} +VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const VKDevice& device_, + VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKRenderPassCache& renderpass_cache_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, gpu{gpu_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, device{device_}, + scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, + update_descriptor_queue{update_descriptor_queue_}, renderpass_cache{renderpass_cache_} {} VKPipelineCache::~VKPipelineCache() = default; std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { - const auto& gpu = system.GPU().Maxwell3D(); - std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr program_addr{GetShaderAddress(system, program)}; - const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const GPUVAddr gpu_addr{GetShaderAddress(maxwell3d, program)}; + const std::optional<VAddr> 
cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); if (!result) { - const auto host_ptr{memory_manager.GetPointer(program_addr)}; + const u8* const host_ptr{gpu_memory.GetPointer(gpu_addr)}; // No shader found - create a new one - constexpr u32 stage_offset = STAGE_MAIN_OFFSET; + static constexpr u32 stage_offset = STAGE_MAIN_OFFSET; const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); - ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, false); const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code), - stage_offset); + auto shader = std::make_unique<Shader>(maxwell3d, stage, gpu_addr, *cpu_addr, + std::move(code), stage_offset); result = shader.get(); if (cpu_addr) { @@ -215,11 +207,11 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( } last_graphics_key = key; - if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(system.GPU())) { + if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(gpu)) { std::unique_lock lock{pipeline_cache}; const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); if (is_cache_miss) { - system.GPU().ShaderNotify().MarkSharderBuilding(); + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); async_shaders.QueueVulkanShader(this, device, scheduler, descriptor_pool, @@ -233,13 +225,13 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); auto& entry = pair->second; if (is_cache_miss) { - system.GPU().ShaderNotify().MarkSharderBuilding(); + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); entry = std::make_unique<VKGraphicsPipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, renderpass_cache, key, bindings, program); - system.GPU().ShaderNotify().MarkShaderComplete(); + gpu.ShaderNotify().MarkShaderComplete(); } last_graphics_pipeline = entry.get(); return last_graphics_pipeline; @@ -255,22 +247,21 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - auto& memory_manager = system.GPU().MemoryManager(); - const auto program_addr = key.shader; + const GPUVAddr gpu_addr = key.shader; - const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); Shader* shader = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one - const auto host_ptr = memory_manager.GetPointer(program_addr); + const auto host_ptr = gpu_memory.GetPointer(gpu_addr); - ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, true); const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr, - std::move(code), KERNEL_MAIN_OFFSET); + auto shader_info = std::make_unique<Shader>(kepler_compute, ShaderType::Compute, gpu_addr, + *cpu_addr, std::move(code), KERNEL_MAIN_OFFSET); shader = shader_info.get(); if (cpu_addr) { @@ -298,7 +289,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } void VKPipelineCache::EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline) { - system.GPU().ShaderNotify().MarkShaderComplete(); + gpu.ShaderNotify().MarkShaderComplete(); std::unique_lock lock{pipeline_cache}; graphics_cache.at(pipeline->GetCacheKey()) = std::move(pipeline); } @@ -339,12 +330,8 @@ void VKPipelineCache::OnShaderRemoval(Shader* shader) { std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { - auto& memory_manager = system.GPU().MemoryManager(); - const auto& gpu = system.GPU().Maxwell3D(); - Specialization specialization; - if (fixed_state.dynamic_state.Topology() == Maxwell::PrimitiveTopology::Points || - device.IsExtExtendedDynamicStateSupported()) { + if (fixed_state.topology == Maxwell::PrimitiveTopology::Points) { float point_size; std::memcpy(&point_size, &fixed_state.point_size, sizeof(float)); specialization.point_size = point_size; @@ -364,12 +351,12 @@ VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { const auto program_enum = static_cast<Maxwell::ShaderProgram>(index); // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + const GPUVAddr gpu_addr = GetShaderAddress(maxwell3d, program_enum); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 
0 : index - 1; // Stage indices are 0 - 5 diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index c04829e77..e558e6658 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -38,7 +38,6 @@ class RasterizerVulkan; class VKComputePipeline; class VKDescriptorPool; class VKDevice; -class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; @@ -85,7 +84,8 @@ namespace Vulkan { class Shader { public: - explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine, + Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code, u32 main_offset); ~Shader(); @@ -97,22 +97,19 @@ public: return shader_ir; } - const VideoCommon::Shader::Registry& GetRegistry() const { - return registry; - } - const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } + const VideoCommon::Shader::Registry& GetRegistry() const { + return registry; + } + const ShaderEntries& GetEntries() const { return entries; } private: - static Tegra::Engines::ConstBufferEngineInterface& GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage); - GPUVAddr gpu_addr{}; VideoCommon::Shader::ProgramCode program_code; VideoCommon::Shader::Registry registry; @@ -122,9 +119,11 @@ private: class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: - explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, + explicit VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const VKDevice& device, + VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache); ~VKPipelineCache() override; @@ -145,7 +144,11 @@ private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( const FixedPipelineState& fixed_state); - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + const VKDevice& device; VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 6cd63d090..ee2d871e3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -9,35 +9,33 @@ #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +using VideoCore::QueryType; + namespace { constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; -constexpr VkQueryType GetTarget(VideoCore::QueryType type) { +constexpr VkQueryType GetTarget(QueryType type) { return QUERY_TARGETS[static_cast<std::size_t>(type)]; } } // Anonymous namespace -QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} +QueryPool::QueryPool(const VKDevice& device_, 
VKScheduler& scheduler, QueryType type_) + : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} QueryPool::~QueryPool() = default; -void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { - device = &device_; - type = type_; -} - -std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { +std::pair<VkQueryPool, u32> QueryPool::Commit() { std::size_t index; do { - index = CommitResource(fence); + index = CommitResource(); } while (usage[index]); usage[index] = true; @@ -47,7 +45,7 @@ std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { void QueryPool::Allocate(std::size_t begin, std::size_t end) { usage.resize(end); - pools.push_back(device->GetLogical().CreateQueryPool({ + pools.push_back(device.GetLogical().CreateQueryPool({ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -68,30 +66,39 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; } -VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, +VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler) - : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool>{system, rasterizer}, - device{device}, scheduler{scheduler} { - for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { - query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, + HostCounter>{rasterizer, maxwell3d, gpu_memory}, + device{device}, scheduler{scheduler}, query_pools{ + QueryPool{device, scheduler, + QueryType::SamplesPassed}, + } {} + +VKQueryCache::~VKQueryCache() { + // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class + // destructor is called. The query cache should be redesigned to have a proper ownership model + // instead of using shared pointers. 
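Two notions of "free" coexist in this file: ResourcePool::CommitResource() only proves the GPU has signalled a slot's tick, while usage[] records whether the query cache has handed the slot back through Reserve(). That is why Commit() above loops until a slot is free on both counts; a restatement of that loop with the intent spelled out (the destructor body that follows then drains every counter stream so no HostCounter outlives these pools):

// Restated from QueryPool::Commit above:
std::size_t index;
do {
    index = CommitResource(); // GPU-side free: the slot's tick has been signalled
} while (usage[index]);       // cache-side busy: not yet returned via Reserve()
usage[index] = true;          // the caller now owns this query slot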
+ for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { + auto& stream = Stream(static_cast<QueryType>(query_type)); + stream.Update(false); + stream.Reset(); } } -VKQueryCache::~VKQueryCache() = default; - -std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(); } -void VKQueryCache::Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query) { +void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { query_pools[static_cast<std::size_t>(type)].Reserve(query); } HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) + QueryType type) : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + type{type}, query{cache.AllocateQuery(type)}, tick{cache.Scheduler().CurrentTick()} { const vk::Device* logical = &cache.Device().GetLogical(); cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { logical->ResetQueryPoolEXT(query.first, query.second, 1); @@ -109,7 +116,7 @@ void HostCounter::EndQuery() { } u64 HostCounter::BlockingQuery() const { - if (ticks >= cache.Scheduler().Ticks()) { + if (tick >= cache.Scheduler().CurrentTick()) { cache.Scheduler().Flush(); } u64 data; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 40119e6d3..2e57fb75d 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -11,7 +11,7 @@ #include "common/common_types.h" #include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace VideoCore { @@ -28,14 +28,12 @@ class VKScheduler; using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; -class QueryPool final : public VKFencedPool { +class QueryPool final : public ResourcePool { public: - explicit QueryPool(); + explicit QueryPool(const VKDevice& device, VKScheduler& scheduler, VideoCore::QueryType type); ~QueryPool() override; - void Initialize(const VKDevice& device, VideoCore::QueryType type); - - std::pair<VkQueryPool, u32> Commit(VKFence& fence); + std::pair<VkQueryPool, u32> Commit(); void Reserve(std::pair<VkQueryPool, u32> query); @@ -45,18 +43,18 @@ protected: private: static constexpr std::size_t GROW_STEP = 512; - const VKDevice* device = nullptr; - VideoCore::QueryType type = {}; + const VKDevice& device; + const VideoCore::QueryType type; std::vector<vk::QueryPool> pools; std::vector<bool> usage; }; class VKQueryCache final - : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool> { + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler); ~VKQueryCache(); @@ -75,6 +73,7 @@ public: 
private: const VKDevice& device; VKScheduler& scheduler; + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { @@ -91,7 +90,7 @@ private: VKQueryCache& cache; const VideoCore::QueryType type; const std::pair<VkQueryPool, u32> query; - const u64 ticks; + const u64 tick; }; class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 936f76195..e0fb8693f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -31,7 +31,6 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -381,28 +380,28 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { } } -RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, - VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, StateTracker& state_tracker, - VKScheduler& scheduler) - : RasterizerAccelerated{system.Memory()}, system{system}, render_window{renderer}, - screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, - staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), renderpass_cache(device), +RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Tegra::MemoryManager& gpu_memory_, + Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info_, + const VKDevice& device_, VKMemoryManager& memory_manager_, + StateTracker& state_tracker_, VKScheduler& scheduler_) + : RasterizerAccelerated(cpu_memory), gpu(gpu_), gpu_memory(gpu_memory_), + maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), screen_info(screen_info_), + device(device_), memory_manager(memory_manager_), state_tracker(state_tracker_), + scheduler(scheduler_), staging_pool(device, memory_manager, scheduler), + descriptor_pool(device, scheduler_), update_descriptor_queue(device, scheduler), + renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, - staging_pool), - pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, - renderpass_cache), - buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device), - fence_manager(system, *this, device, scheduler, texture_cache, buffer_cache, query_cache), - query_cache(system, *this, device, scheduler), - wfi_event{device.GetLogical().CreateNewEvent()}, async_shaders{renderer} { + texture_cache(*this, maxwell3d, gpu_memory, device, memory_manager, scheduler, 
staging_pool), + pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, + descriptor_pool, update_descriptor_queue, renderpass_cache), + buffer_cache(*this, gpu_memory, cpu_memory, device, memory_manager, scheduler, staging_pool), + sampler_cache(device), query_cache(*this, maxwell3d, gpu_memory, device, scheduler), + fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, device, + scheduler), + wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window) { scheduler.SetQueryCache(query_cache); if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); @@ -414,15 +413,13 @@ RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); + SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); query_cache.UpdateCounters(); - SCOPE_EXIT({ system.GPU().TickWork(); }); - - const auto& gpu = system.GPU().Maxwell3D(); GraphicsPipelineCacheKey key; - key.fixed_state.Fill(gpu.regs, device.IsExtExtendedDynamicStateSupported()); + key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); @@ -480,8 +477,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - const auto& gpu = system.GPU().Maxwell3D(); - if (!system.GPU().Maxwell3D().ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } @@ -490,7 +486,7 @@ void RasterizerVulkan::Clear() { query_cache.UpdateCounters(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; const bool use_depth = regs.clear_buffers.Z; @@ -559,7 +555,7 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { query_cache.UpdateCounters(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; auto& pipeline = pipeline_cache.GetComputePipeline({ .shader = code_addr, .shared_memory_size = launch_desc.shared_alloc, @@ -655,16 +651,14 @@ void RasterizerVulkan::SyncGuestHost() { } void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { - gpu.MemoryManager().Write<u32>(addr, value); + gpu_memory.Write<u32>(addr, value); return; } fence_manager.SignalSemaphore(addr, value); } void RasterizerVulkan::SignalSyncPoint(u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { gpu.IncrementSyncPoint(value); return; @@ -673,7 +667,6 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) { } void RasterizerVulkan::ReleaseFences() { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { return; } @@ -751,10 +744,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerVulkan::SetupDirtyFlags() { - state_tracker.Initialize(); -} - void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -778,10 +767,9 @@ void RasterizerVulkan::FlushWork() { RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& maxwell3d = system.GPU().Maxwell3D(); - auto& dirty = maxwell3d.dirty.flags; - auto& regs = maxwell3d.regs; + const auto& regs = maxwell3d.regs; + auto& dirty = maxwell3d.dirty.flags; const bool update_rendertargets = 
dirty[VideoCommon::Dirty::RenderTargets]; dirty[VideoCommon::Dirty::RenderTargets] = false; @@ -844,7 +832,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( return true; }; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); for (std::size_t index = 0; index < num_attachments; ++index) { if (try_push(color_attachments[index])) { @@ -880,13 +868,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Geometry); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; SetupVertexArrays(buffer_bindings); const u32 base_instance = regs.vb_base_instance; - const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1; + const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1; const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count; @@ -947,7 +934,7 @@ void RasterizerVulkan::SetupImageTransitions( } void RasterizerVulkan::UpdateDynamicStates() { - auto& regs = system.GPU().Maxwell3D().regs; + auto& regs = maxwell3d.regs; UpdateViewportsState(regs); UpdateScissorsState(regs); UpdateDepthBias(regs); @@ -961,14 +948,13 @@ void RasterizerVulkan::UpdateDynamicStates() { UpdateDepthWriteEnable(regs); UpdateDepthCompareOp(regs); UpdateFrontFace(regs); - UpdatePrimitiveTopology(regs); UpdateStencilOp(regs); UpdateStencilTestEnable(regs); } } void RasterizerVulkan::BeginTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1000,7 +986,7 @@ void RasterizerVulkan::BeginTransformFeedback() { } void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1013,7 +999,7 @@ void RasterizerVulkan::EndTransformFeedback() { } void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { const auto& vertex_array = regs.vertex_array[index]; @@ -1039,7 +1025,7 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar if (params.num_vertices == 0) { return; } - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; switch (regs.draw.topology) { case Maxwell::PrimitiveTopology::Quads: { if (!params.is_indexed) { @@ -1087,8 +1073,7 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& shader_stage = gpu.state.shader_stages[stage]; + const auto& shader_stage = maxwell3d.state.shader_stages[stage]; for (const auto& entry : entries.const_buffers) { SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); } @@ -1096,8 +1081,7 @@ void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, s void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) 
{ MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - auto& gpu{system.GPU()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage]}; + const auto& cbufs{maxwell3d.state.shader_stages[stage]}; for (const auto& entry : entries.global_buffers) { const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); @@ -1107,19 +1091,17 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.uniform_texels) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.samplers) { for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(gpu, entry, stage, i); + const auto texture = GetTextureInfo(maxwell3d, entry, stage, i); SetupTexture(texture, entry); } } @@ -1127,25 +1109,23 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std:: void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.storage_texels) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; SetupStorageTexel(image, entry); } } void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, stage).tic; + const auto tic = GetTextureInfo(maxwell3d, entry, stage).tic; SetupImage(tic, entry); } } void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); @@ -1159,7 +1139,7 @@ void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto cbufs{system.GPU().KeplerCompute().launch_description.const_buffer_config}; + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; for (const auto& entry : entries.global_buffers) { const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; SetupGlobalBuffer(entry, addr); @@ -1168,19 +1148,17 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.uniform_texels) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + 
const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.samplers) { for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i); + const auto texture = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex, i); SetupTexture(texture, entry); } } @@ -1188,18 +1166,16 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.storage_texels) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; SetupStorageTexel(image, entry); } } void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; SetupImage(tic, entry); } } @@ -1223,9 +1199,8 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, } void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto actual_addr = memory_manager.Read<u64>(address); - const auto size = memory_manager.Read<u32>(address + 8); + const u64 actual_addr = gpu_memory.Read<u64>(address); + const u32 size = gpu_memory.Read<u32>(address + 8); if (size == 0) { // Sometimes global memory pointers don't have a proper size. 
Upload a dummy entry @@ -1442,16 +1417,6 @@ void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) { [front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); }); } -void RasterizerVulkan::UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs) { - if (!state_tracker.TouchPrimitiveTopology()) { - return; - } - const Maxwell::PrimitiveTopology primitive_topology = regs.draw.topology.Value(); - scheduler.Record([this, primitive_topology](vk::CommandBuffer cmdbuf) { - cmdbuf.SetPrimitiveTopologyEXT(MaxwellToVK::PrimitiveTopology(device, primitive_topology)); - }); -} - void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) { if (!state_tracker.TouchStencilOp()) { return; @@ -1508,7 +1473,7 @@ std::size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const { } std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -1523,9 +1488,8 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { } std::size_t RasterizerVulkan::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } std::size_t RasterizerVulkan::CalculateConstBufferSize( @@ -1540,7 +1504,7 @@ std::size_t RasterizerVulkan::CalculateConstBufferSize( } RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); RenderPassParams params; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index f640ba649..237e51fa4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -25,7 +25,6 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -106,10 +105,11 @@ struct ImageView { class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - StateTracker& state_tracker, VKScheduler& scheduler); + VKMemoryManager& memory_manager, StateTracker& state_tracker, + VKScheduler& scheduler); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -135,7 +135,6 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr 
framebuffer_addr, u32 pixel_stride) override; - void SetupDirtyFlags() override; VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { return async_shaders; @@ -260,7 +259,6 @@ private: void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs); - void UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); @@ -279,11 +277,13 @@ private: VkBuffer DefaultBuffer(); - Core::System& system; - Core::Frontend::EmuWindow& render_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + VKScreenInfo& screen_info; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; StateTracker& state_tracker; VKScheduler& scheduler; @@ -300,8 +300,8 @@ private: VKPipelineCache pipeline_cache; VKBufferCache buffer_cache; VKSamplerCache sampler_cache; - VKFenceManager fence_manager; VKQueryCache query_cache; + VKFenceManager fence_manager; vk::Buffer default_buffer; VKMemoryCommit default_buffer_commit; diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp deleted file mode 100644 index f19330a36..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include "common/assert.h" -#include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -namespace { - -// TODO(Rodrigo): Fine tune these numbers. -constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; -constexpr std::size_t FENCES_GROW_STEP = 0x40; - -constexpr VkFenceCreateInfo BuildFenceCreateInfo() { - return { - .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - }; -} - -} // Anonymous namespace - -class CommandBufferPool final : public VKFencedPool { -public: - explicit CommandBufferPool(const VKDevice& device) - : VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {} - - void Allocate(std::size_t begin, std::size_t end) override { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. 
- Pool& pool = pools.emplace_back(); - pool.handle = device.GetLogical().CreateCommandPool({ - .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | - VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, - .queueFamilyIndex = device.GetGraphicsFamily(), - }); - pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); - } - - VkCommandBuffer Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); - const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; - const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; - return pools[pool_index].cmdbufs[sub_index]; - } - -private: - struct Pool { - vk::CommandPool handle; - vk::CommandBuffers cmdbufs; - }; - - const VKDevice& device; - std::vector<Pool> pools; -}; - -VKResource::VKResource() = default; - -VKResource::~VKResource() = default; - -VKFence::VKFence(const VKDevice& device) - : device{device}, handle{device.GetLogical().CreateFence(BuildFenceCreateInfo())} {} - -VKFence::~VKFence() = default; - -void VKFence::Wait() { - switch (const VkResult result = handle.Wait()) { - case VK_SUCCESS: - return; - case VK_ERROR_DEVICE_LOST: - device.ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(result); - } -} - -void VKFence::Release() { - ASSERT(is_owned); - is_owned = false; -} - -void VKFence::Commit() { - is_owned = true; - is_used = true; -} - -bool VKFence::Tick(bool gpu_wait, bool owner_wait) { - if (!is_used) { - // If a fence is not used it's always free. - return true; - } - if (is_owned && !owner_wait) { - // The fence is still being owned (Release has not been called) and ownership wait has - // not been asked. - return false; - } - - if (gpu_wait) { - // Wait for the fence if it has been requested. - (void)handle.Wait(); - } else { - if (handle.GetStatus() != VK_SUCCESS) { - // Vulkan fence is not ready, not much it can do here - return false; - } - } - - // Broadcast resources their free state. - for (auto* resource : protected_resources) { - resource->OnFenceRemoval(this); - } - protected_resources.clear(); - - // Prepare fence for reusage. 
- handle.Reset(); - is_used = false; - return true; -} - -void VKFence::Protect(VKResource* resource) { - protected_resources.push_back(resource); -} - -void VKFence::Unprotect(VKResource* resource) { - const auto it = std::find(protected_resources.begin(), protected_resources.end(), resource); - ASSERT(it != protected_resources.end()); - - resource->OnFenceRemoval(this); - protected_resources.erase(it); -} - -void VKFence::RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept { - std::replace(std::begin(protected_resources), std::end(protected_resources), old_resource, - new_resource); -} - -VKFenceWatch::VKFenceWatch() = default; - -VKFenceWatch::VKFenceWatch(VKFence& initial_fence) { - Watch(initial_fence); -} - -VKFenceWatch::VKFenceWatch(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } -} - -VKFenceWatch& VKFenceWatch::operator=(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } - return *this; -} - -VKFenceWatch::~VKFenceWatch() { - if (fence) { - fence->Unprotect(this); - } -} - -void VKFenceWatch::Wait() { - if (fence == nullptr) { - return; - } - fence->Wait(); - fence->Unprotect(this); -} - -void VKFenceWatch::Watch(VKFence& new_fence) { - Wait(); - fence = &new_fence; - fence->Protect(this); -} - -bool VKFenceWatch::TryWatch(VKFence& new_fence) { - if (fence) { - return false; - } - fence = &new_fence; - fence->Protect(this); - return true; -} - -void VKFenceWatch::OnFenceRemoval(VKFence* signaling_fence) { - ASSERT_MSG(signaling_fence == fence, "Removing the wrong fence"); - fence = nullptr; -} - -VKFencedPool::VKFencedPool(std::size_t grow_step) : grow_step{grow_step} {} - -VKFencedPool::~VKFencedPool() = default; - -std::size_t VKFencedPool::CommitResource(VKFence& fence) { - const auto Search = [&](std::size_t begin, std::size_t end) -> std::optional<std::size_t> { - for (std::size_t iterator = begin; iterator < end; ++iterator) { - if (watches[iterator]->TryWatch(fence)) { - // The resource is now being watched, a free resource was successfully found. - return iterator; - } - } - return {}; - }; - // Try to find a free resource from the hinted position to the end. - auto found = Search(free_iterator, watches.size()); - if (!found) { - // Search from beginning to the hinted position. - found = Search(0, free_iterator); - if (!found) { - // Both searches failed, the pool is full; handle it. - const std::size_t free_resource = ManageOverflow(); - - // Watch will wait for the resource to be free. - watches[free_resource]->Watch(fence); - found = free_resource; - } - } - // Free iterator is hinted to the resource after the one that's been commited. - free_iterator = (*found + 1) % watches.size(); - return *found; -} - -std::size_t VKFencedPool::ManageOverflow() { - const std::size_t old_capacity = watches.size(); - Grow(); - - // The last entry is guaranted to be free, since it's the first element of the freshly - // allocated resources. 
- return old_capacity; -} - -void VKFencedPool::Grow() { - const std::size_t old_capacity = watches.size(); - watches.resize(old_capacity + grow_step); - std::generate(watches.begin() + old_capacity, watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); - Allocate(old_capacity, old_capacity + grow_step); -} - -VKResourceManager::VKResourceManager(const VKDevice& device) : device{device} { - GrowFences(FENCES_GROW_STEP); - command_buffer_pool = std::make_unique<CommandBufferPool>(device); -} - -VKResourceManager::~VKResourceManager() = default; - -VKFence& VKResourceManager::CommitFence() { - const auto StepFences = [&](bool gpu_wait, bool owner_wait) -> VKFence* { - const auto Tick = [=](auto& fence) { return fence->Tick(gpu_wait, owner_wait); }; - const auto hinted = fences.begin() + fences_iterator; - - auto it = std::find_if(hinted, fences.end(), Tick); - if (it == fences.end()) { - it = std::find_if(fences.begin(), hinted, Tick); - if (it == hinted) { - return nullptr; - } - } - fences_iterator = std::distance(fences.begin(), it) + 1; - if (fences_iterator >= fences.size()) - fences_iterator = 0; - - auto& fence = *it; - fence->Commit(); - return fence.get(); - }; - - VKFence* found_fence = StepFences(false, false); - if (!found_fence) { - // Try again, this time waiting. - found_fence = StepFences(true, false); - - if (!found_fence) { - // Allocate new fences and try again. - LOG_INFO(Render_Vulkan, "Allocating new fences {} -> {}", fences.size(), - fences.size() + FENCES_GROW_STEP); - - GrowFences(FENCES_GROW_STEP); - found_fence = StepFences(true, false); - ASSERT(found_fence != nullptr); - } - } - return *found_fence; -} - -VkCommandBuffer VKResourceManager::CommitCommandBuffer(VKFence& fence) { - return command_buffer_pool->Commit(fence); -} - -void VKResourceManager::GrowFences(std::size_t new_fences_count) { - const std::size_t previous_size = fences.size(); - fences.resize(previous_size + new_fences_count); - - std::generate(fences.begin() + previous_size, fences.end(), - [this] { return std::make_unique<VKFence>(device); }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h deleted file mode 100644 index f683d2276..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.h +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <memory> -#include <vector> -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKFence; -class VKResourceManager; - -class CommandBufferPool; - -/// Interface for a Vulkan resource -class VKResource { -public: - explicit VKResource(); - virtual ~VKResource(); - - /** - * Signals the object that an owning fence has been signaled. - * @param signaling_fence Fence that signals its usage end. - */ - virtual void OnFenceRemoval(VKFence* signaling_fence) = 0; -}; - -/** - * Fences take ownership of objects, protecting them from GPU-side or driver-side concurrent access. - * They must be commited from the resource manager. Their usage flow is: commit the fence from the - * resource manager, protect resources with it and use them, send the fence to an execution queue - * and Wait for it if needed and then call Release. Used resources will automatically be signaled - * when they are free to be reused. 
- * @brief Protects resources for concurrent usage and signals its release. - */ -class VKFence { - friend class VKResourceManager; - -public: - explicit VKFence(const VKDevice& device); - ~VKFence(); - - /** - * Waits for the fence to be signaled. - * @warning You must have ownership of the fence and it has to be previously sent to a queue to - * call this function. - */ - void Wait(); - - /** - * Releases ownership of the fence. Pass after it has been sent to an execution queue. - * Unmanaged usage of the fence after the call will result in undefined behavior because it may - * be being used for something else. - */ - void Release(); - - /// Protects a resource with this fence. - void Protect(VKResource* resource); - - /// Removes protection for a resource. - void Unprotect(VKResource* resource); - - /// Redirects one protected resource to a new address. - void RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept; - - /// Retreives the fence. - operator VkFence() const { - return *handle; - } - -private: - /// Take ownership of the fence. - void Commit(); - - /** - * Updates the fence status. - * @warning Waiting for the owner might soft lock the execution. - * @param gpu_wait Wait for the fence to be signaled by the driver. - * @param owner_wait Wait for the owner to signal its freedom. - * @returns True if the fence is free. Waiting for gpu and owner will always return true. - */ - bool Tick(bool gpu_wait, bool owner_wait); - - const VKDevice& device; ///< Device handler - vk::Fence handle; ///< Vulkan fence - std::vector<VKResource*> protected_resources; ///< List of resources protected by this fence - bool is_owned = false; ///< The fence has been commited but not released yet. - bool is_used = false; ///< The fence has been commited but it has not been checked to be free. -}; - -/** - * A fence watch is used to keep track of the usage of a fence and protect a resource or set of - * resources without having to inherit VKResource from their handlers. - */ -class VKFenceWatch final : public VKResource { -public: - explicit VKFenceWatch(); - VKFenceWatch(VKFence& initial_fence); - VKFenceWatch(VKFenceWatch&&) noexcept; - VKFenceWatch(const VKFenceWatch&) = delete; - ~VKFenceWatch() override; - - VKFenceWatch& operator=(VKFenceWatch&&) noexcept; - - /// Waits for the fence to be released. - void Wait(); - - /** - * Waits for a previous fence and watches a new one. - * @param new_fence New fence to wait to. - */ - void Watch(VKFence& new_fence); - - /** - * Checks if it's currently being watched and starts watching it if it's available. - * @returns True if a watch has started, false if it's being watched. - */ - bool TryWatch(VKFence& new_fence); - - void OnFenceRemoval(VKFence* signaling_fence) override; - - /** - * Do not use it paired with Watch. Use TryWatch instead. - * Returns true when the watch is free. - */ - bool IsUsed() const { - return fence != nullptr; - } - -private: - VKFence* fence{}; ///< Fence watching this resource. nullptr when the watch is free. -}; - -/** - * Handles a pool of resources protected by fences. Manages resource overflow allocating more - * resources. - */ -class VKFencedPool { -public: - explicit VKFencedPool(std::size_t grow_step); - virtual ~VKFencedPool(); - -protected: - /** - * Commits a free resource and protects it with a fence. It may allocate new resources. - * @param fence Fence that protects the commited resource. - * @returns Index of the resource commited. 
- */ - std::size_t CommitResource(VKFence& fence); - - /// Called when a chunk of resources have to be allocated. - virtual void Allocate(std::size_t begin, std::size_t end) = 0; - -private: - /// Manages pool overflow allocating new resources. - std::size_t ManageOverflow(); - - /// Allocates a new page of resources. - void Grow(); - - std::size_t grow_step = 0; ///< Number of new resources created after an overflow - std::size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found - std::vector<std::unique_ptr<VKFenceWatch>> watches; ///< Set of watched resources -}; - -/** - * The resource manager handles all resources that can be protected with a fence avoiding - * driver-side or GPU-side concurrent usage. Usage is documented in VKFence. - */ -class VKResourceManager final { -public: - explicit VKResourceManager(const VKDevice& device); - ~VKResourceManager(); - - /// Commits a fence. It has to be sent to a queue and released. - VKFence& CommitFence(); - - /// Commits an unused command buffer and protects it with a fence. - VkCommandBuffer CommitCommandBuffer(VKFence& fence); - -private: - /// Allocates new fences. - void GrowFences(std::size_t new_fences_count); - - const VKDevice& device; ///< Device handler. - std::size_t fences_iterator = 0; ///< Index where a free fence is likely to be found. - std::vector<std::unique_ptr<VKFence>> fences; ///< Pool of fences. - std::unique_ptr<CommandBufferPool> command_buffer_pool; ///< Pool of command buffers. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp new file mode 100644 index 000000000..ee274ac59 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -0,0 +1,63 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <optional> + +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace Vulkan { + +ResourcePool::ResourcePool(MasterSemaphore& master_semaphore_, size_t grow_step_) + : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} + +ResourcePool::~ResourcePool() = default; + +size_t ResourcePool::CommitResource() { + // Refresh semaphore to query updated results + master_semaphore.Refresh(); + + const auto search = [this](size_t begin, size_t end) -> std::optional<size_t> { + for (size_t iterator = begin; iterator < end; ++iterator) { + if (master_semaphore.IsFree(ticks[iterator])) { + ticks[iterator] = master_semaphore.CurrentTick(); + return iterator; + } + } + return {}; + }; + // Try to find a free resource from the hinted position to the end. + auto found = search(free_iterator, ticks.size()); + if (!found) { + // Search from beginning to the hinted position. + found = search(0, free_iterator); + if (!found) { + // Both searches failed, the pool is full; handle it. + const size_t free_resource = ManageOverflow(); + + ticks[free_resource] = master_semaphore.CurrentTick(); + found = free_resource; + } + } + // Free iterator is hinted to the resource after the one that's been committed. + free_iterator = (*found + 1) % ticks.size(); + return *found; +} + +size_t ResourcePool::ManageOverflow() { + const size_t old_capacity = ticks.size(); + Grow(); + + // The last entry is guaranteed to be free, since it's the first element of the freshly + // allocated resources.
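The tick-based pool is easiest to see end to end with a toy subclass (ManageOverflow() continues below). A minimal sketch against the ResourcePool interface declared in vk_resource_pool.h further down; ExamplePool and its int slots are illustrative and not part of this change:

#include <cstddef>
#include <vector>

#include "video_core/renderer_vulkan/vk_resource_pool.h"

namespace Vulkan {

// Illustrative subclass: plain ints stand in for Vulkan handles. Allocate()
// is the only hook a concrete pool must implement.
class ExamplePool final : public ResourcePool {
public:
    explicit ExamplePool(MasterSemaphore& master_semaphore)
        : ResourcePool(master_semaphore, 16) {}

    int Commit() {
        // CommitResource() returns a slot whose previous tick the GPU has
        // already signalled, growing the pool by grow_step when none is free.
        return slots[CommitResource()];
    }

protected:
    void Allocate(size_t begin, size_t end) override {
        slots.resize(end);
        for (size_t i = begin; i < end; ++i) {
            slots[i] = static_cast<int>(i); // create the real resource here
        }
    }

private:
    std::vector<int> slots;
};

} // namespace Vulkan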
+ return old_capacity; +} + +void ResourcePool::Grow() { + const size_t old_capacity = ticks.size(); + ticks.resize(old_capacity + grow_step); + Allocate(old_capacity, old_capacity + grow_step); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h new file mode 100644 index 000000000..a018c7ec2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace Vulkan { + +class MasterSemaphore; + +/** + * Handles a pool of resources protected by the master semaphore's ticks. Manages resource + * overflow by allocating more resources. + */ +class ResourcePool { +public: + explicit ResourcePool(MasterSemaphore& master_semaphore, size_t grow_step); + virtual ~ResourcePool(); + +protected: + size_t CommitResource(); + + /// Called when a chunk of resources has to be allocated. + virtual void Allocate(size_t begin, size_t end) = 0; + +private: + /// Manages pool overflow allocating new resources. + size_t ManageOverflow(); + + /// Allocates a new page of resources. + void Grow(); + + MasterSemaphore& master_semaphore; + size_t grow_step = 0; ///< Number of new resources created after an overflow + size_t free_iterator = 0; ///< Hint to where the next free resource is likely to be found + std::vector<u64> ticks; ///< Ticks for each resource +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index dbbd0961a..1a483dc71 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -10,9 +10,10 @@ #include "common/microprofile.h" #include "common/thread.h" +#include "video_core/renderer_vulkan/vk_command_pool.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -35,10 +36,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker) - : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, - next_fence{&resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const VKDevice& device_, StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, + master_semaphore{std::make_unique<MasterSemaphore>(device)}, + command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -50,20 +51,27 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -void VKScheduler::Flush(bool release_fence, VkSemaphore semaphore) { +u64 VKScheduler::CurrentTick() const noexcept { + return master_semaphore->CurrentTick(); +} + +bool VKScheduler::IsFree(u64 tick) const noexcept { + return master_semaphore->IsFree(tick); +} + +void VKScheduler::Wait(u64 tick) { + master_semaphore->Wait(tick); +} + +void VKScheduler::Flush(VkSemaphore
semaphore) { SubmitExecution(semaphore); - if (release_fence) { - current_fence->Release(); - } AllocateNewContext(); } -void VKScheduler::Finish(bool release_fence, VkSemaphore semaphore) { +void VKScheduler::Finish(VkSemaphore semaphore) { + const u64 presubmit_tick = CurrentTick(); SubmitExecution(semaphore); - current_fence->Wait(); - if (release_fence) { - current_fence->Release(); - } + Wait(presubmit_tick); AllocateNewContext(); } @@ -160,18 +168,38 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { current_cmdbuf.End(); + const VkSemaphore timeline_semaphore = master_semaphore->Handle(); + const u32 num_signal_semaphores = semaphore ? 2U : 1U; + + const u64 signal_value = master_semaphore->CurrentTick(); + const u64 wait_value = signal_value - 1; + const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + + master_semaphore->NextTick(); + + const std::array signal_values{signal_value, u64(0)}; + const std::array signal_semaphores{timeline_semaphore, semaphore}; + + const VkTimelineSemaphoreSubmitInfoKHR timeline_si{ + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreValueCount = 1, + .pWaitSemaphoreValues = &wait_value, + .signalSemaphoreValueCount = num_signal_semaphores, + .pSignalSemaphoreValues = signal_values.data(), + }; const VkSubmitInfo submit_info{ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .pNext = nullptr, - .waitSemaphoreCount = 0, - .pWaitSemaphores = nullptr, - .pWaitDstStageMask = nullptr, + .pNext = &timeline_si, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &timeline_semaphore, + .pWaitDstStageMask = &wait_stage_mask, .commandBufferCount = 1, .pCommandBuffers = current_cmdbuf.address(), - .signalSemaphoreCount = semaphore ? 1U : 0U, - .pSignalSemaphores = &semaphore, + .signalSemaphoreCount = num_signal_semaphores, + .pSignalSemaphores = signal_semaphores.data(), }; - switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info, *current_fence)) { + switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) { case VK_SUCCESS: break; case VK_ERROR_DEVICE_LOST: @@ -183,14 +211,9 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { } void VKScheduler::AllocateNewContext() { - ++ticks; - std::unique_lock lock{mutex}; - current_fence = next_fence; - next_fence = &resource_manager.CommitFence(); - current_cmdbuf = vk::CommandBuffer(resource_manager.CommitCommandBuffer(*current_fence), - device.GetDispatchLoader()); + current_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader()); current_cmdbuf.Begin({ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 970a65566..7be8a19f0 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -16,42 +16,33 @@ namespace Vulkan { +class CommandPool; +class MasterSemaphore; class StateTracker; class VKDevice; -class VKFence; class VKQueryCache; -class VKResourceManager; - -class VKFenceView { -public: - VKFenceView() = default; - VKFenceView(VKFence* const& fence) : fence{fence} {} - - VKFence* operator->() const noexcept { - return fence; - } - - operator VKFence&() const noexcept { - return *fence; - } - -private: - VKFence* const& fence; -}; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. 
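Each submission now chains on the previous one through a single timeline semaphore: it waits on wait_value = signal_value - 1 and signals signal_value, and the caller's optional binary semaphore is appended with a dummy value of 0, which VK_KHR_timeline_semaphore ignores for binary semaphores. The MasterSemaphore consulted here is added by this commit in vk_master_semaphore.h (not excerpted); the sketch below is reconstructed from the calls visible in this diff, so the member layout is an assumption:

    class MasterSemaphore {
    public:
        [[nodiscard]] u64 CurrentTick() const noexcept {
            return current_tick; // tick the next submission will signal
        }
        [[nodiscard]] bool IsFree(u64 tick) const noexcept {
            return gpu_tick >= tick; // known-signaled without calling Vulkan
        }
        void NextTick() noexcept {
            ++current_tick;
        }
        void Refresh() {
            gpu_tick = semaphore.GetCounter(); // vkGetSemaphoreCounterValueKHR
        }
        void Wait(u64 tick) {
            semaphore.Wait(tick); // vkWaitSemaphoresKHR, see wrapper.h below
        }
        [[nodiscard]] VkSemaphore Handle() const noexcept {
            return *semaphore;
        }

    private:
        vk::Semaphore semaphore;          ///< Timeline semaphore
        std::atomic<u64> current_tick{1}; ///< Host tick
        std::atomic<u64> gpu_tick{0};     ///< Last tick seen signaled
    };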
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 970a65566..7be8a19f0 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -16,42 +16,33 @@ namespace Vulkan {
+class CommandPool;
+class MasterSemaphore;
class StateTracker;
class VKDevice;
-class VKFence;
class VKQueryCache;
-class VKResourceManager;
-
-class VKFenceView {
-public:
-    VKFenceView() = default;
-    VKFenceView(VKFence* const& fence) : fence{fence} {}
-
-    VKFence* operator->() const noexcept {
-        return fence;
-    }
-
-    operator VKFence&() const noexcept {
-        return *fence;
-    }
-
-private:
-    VKFence* const& fence;
-};

/// The scheduler abstracts command buffer and fence management with an interface that's able to do
/// OpenGL-like operations on Vulkan command buffers.
class VKScheduler {
public:
-    explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager,
-                         StateTracker& state_tracker);
+    explicit VKScheduler(const VKDevice& device, StateTracker& state_tracker);
    ~VKScheduler();

+    /// Returns the current command buffer tick.
+    [[nodiscard]] u64 CurrentTick() const noexcept;
+
+    /// Returns true when a tick has been triggered by the GPU.
+    [[nodiscard]] bool IsFree(u64 tick) const noexcept;
+
+    /// Waits for the given tick to trigger on the GPU.
+    void Wait(u64 tick);
+
    /// Sends the current execution context to the GPU.
-    void Flush(bool release_fence = true, VkSemaphore semaphore = nullptr);
+    void Flush(VkSemaphore semaphore = nullptr);

    /// Sends the current execution context to the GPU and waits for it to complete.
-    void Finish(bool release_fence = true, VkSemaphore semaphore = nullptr);
+    void Finish(VkSemaphore semaphore = nullptr);

    /// Waits for the worker thread to finish executing everything. After this function returns it's
    /// safe to touch worker resources.
@@ -86,14 +77,9 @@ public:
        (void)chunk->Record(command);
    }

-    /// Gets a reference to the current fence.
-    VKFenceView GetFence() const {
-        return current_fence;
-    }
-
-    /// Returns the current command buffer tick.
-    u64 Ticks() const {
-        return ticks;
+    /// Returns the master timeline semaphore.
+    [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept {
+        return *master_semaphore;
    }

private:
@@ -171,6 +157,13 @@ private:
        std::array<u8, 0x8000> data{};
    };

+    struct State {
+        VkRenderPass renderpass = nullptr;
+        VkFramebuffer framebuffer = nullptr;
+        VkExtent2D render_area = {0, 0};
+        VkPipeline graphics_pipeline = nullptr;
+    };
+
    void WorkerThread();

    void SubmitExecution(VkSemaphore semaphore);
@@ -186,30 +179,23 @@ private:
    void AcquireNewChunk();

    const VKDevice& device;
-    VKResourceManager& resource_manager;
    StateTracker& state_tracker;

+    std::unique_ptr<MasterSemaphore> master_semaphore;
+    std::unique_ptr<CommandPool> command_pool;
+
    VKQueryCache* query_cache = nullptr;

    vk::CommandBuffer current_cmdbuf;
-    VKFence* current_fence = nullptr;
-    VKFence* next_fence = nullptr;
-
-    struct State {
-        VkRenderPass renderpass = nullptr;
-        VkFramebuffer framebuffer = nullptr;
-        VkExtent2D render_area = {0, 0};
-        VkPipeline graphics_pipeline = nullptr;
-    } state;

    std::unique_ptr<CommandChunk> chunk;
    std::thread worker_thread;

+    State state;
+
    Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue;
    Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;
    std::mutex mutex;
    std::condition_variable cv;
-    std::atomic<u64> ticks = 0;
    bool quit = false;
};
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index cd7d7a4e4..a20452b87 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -272,12 +272,19 @@ bool IsPrecise(Operation operand) {
    return false;
}

+u32 ShaderVersion(const VKDevice& device) {
+    if (device.InstanceApiVersion() < VK_API_VERSION_1_1) {
+        return 0x00010000;
+    }
+    return 0x00010300;
+}
+
class SPIRVDecompiler final : public Sirit::Module {
public:
    explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage,
                             const Registry& registry, const Specialization& specialization)
-        : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()},
-          registry{registry}, specialization{specialization} {
+        : Module(ShaderVersion(device)), device{device}, ir{ir}, stage{stage},
+          header{ir.GetHeader()}, registry{registry}, specialization{specialization} {
        if (stage != ShaderType::Compute) {
            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
        }
@@ -293,6 +300,7 @@ public:
        AddCapability(spv::Capability::DrawParameters);
        AddCapability(spv::Capability::SubgroupBallotKHR);
        AddCapability(spv::Capability::SubgroupVoteKHR);
+        AddExtension("SPV_KHR_16bit_storage");
        AddExtension("SPV_KHR_shader_ballot");
        AddExtension("SPV_KHR_subgroup_vote");
        AddExtension("SPV_KHR_storage_buffer_storage_class");
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 5eca0ab91..2fd3b7f39 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -10,36 +10,18 @@
#include "common/bit_util.h"
#include "common/common_types.h"
#include "video_core/renderer_vulkan/vk_device.h"
-#include "video_core/renderer_vulkan/vk_resource_manager.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/wrapper.h"

namespace Vulkan {

-VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence,
-                                                  u64 last_epoch)
-    : buffer{std::move(buffer)}, watch{fence}, last_epoch{last_epoch} {}
+VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer_)
+    : buffer{std::move(buffer_)} {}

-VKStagingBufferPool::StagingBuffer::StagingBuffer(StagingBuffer&& rhs) noexcept {
-    buffer = std::move(rhs.buffer);
-    watch = std::move(rhs.watch);
-    last_epoch = rhs.last_epoch;
-}
-
-VKStagingBufferPool::StagingBuffer::~StagingBuffer() = default;
-
-VKStagingBufferPool::StagingBuffer& VKStagingBufferPool::StagingBuffer::operator=(
-    StagingBuffer&& rhs) noexcept {
-    buffer = std::move(rhs.buffer);
-    watch = std::move(rhs.watch);
-    last_epoch = rhs.last_epoch;
-    return *this;
-}
-
-VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager,
-                                         VKScheduler& scheduler)
-    : device{device}, memory_manager{memory_manager}, scheduler{scheduler} {}
+VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device_, VKMemoryManager& memory_manager_,
+                                         VKScheduler& scheduler_)
+    : device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_} {}

VKStagingBufferPool::~VKStagingBufferPool() = default;

@@ -51,7 +33,6 @@ VKBuffer& VKStagingBufferPool::GetUnusedBuffer(std::size_t size, bool host_visib
}

void VKStagingBufferPool::TickFrame() {
-    ++epoch;
    current_delete_level = (current_delete_level + 1) % NumLevels;

    ReleaseCache(true);
@@ -59,11 +40,12 @@ void VKStagingBufferPool::TickFrame() {
}

VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_visible) {
-    for (auto& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) {
-        if (entry.watch.TryWatch(scheduler.GetFence())) {
-            entry.last_epoch = epoch;
-            return &*entry.buffer;
+    for (StagingBuffer& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) {
+        if (!scheduler.IsFree(entry.tick)) {
+            continue;
        }
+        entry.tick = scheduler.CurrentTick();
+        return &*entry.buffer;
    }
    return nullptr;
}
@@ -86,8 +68,10 @@ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_v
    });
    buffer->commit = memory_manager.Commit(buffer->handle, host_visible);

-    auto& entries = GetCache(host_visible)[log2].entries;
-    return *entries.emplace_back(std::move(buffer), scheduler.GetFence(), epoch).buffer;
+    std::vector<StagingBuffer>& entries = GetCache(host_visible)[log2].entries;
+    StagingBuffer& entry = entries.emplace_back(std::move(buffer));
+    entry.tick = scheduler.CurrentTick();
+    return *entry.buffer;
}

VKStagingBufferPool::StagingBuffersCache& VKStagingBufferPool::GetCache(bool host_visible) {
@@ -109,9 +93,8 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo
    auto& entries = staging.entries;
    const std::size_t old_size = entries.size();

-    const auto is_deleteable = [this](const auto& entry) {
-        static constexpr u64 epochs_to_destroy = 180;
-        return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed();
+    const auto is_deleteable = [this](const StagingBuffer& entry) {
+        return scheduler.IsFree(entry.tick);
    };
    const std::size_t begin_offset = staging.delete_index;
    const std::size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size);
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
index 3c4901437..2dd5049ac 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
@@ -10,13 +10,11 @@
#include "common/common_types.h"

#include "video_core/renderer_vulkan/vk_memory_manager.h"
-#include "video_core/renderer_vulkan/vk_resource_manager.h"
#include "video_core/renderer_vulkan/wrapper.h"

namespace Vulkan {

class VKDevice;
-class VKFenceWatch;
class VKScheduler;

struct VKBuffer final {
@@ -36,16 +34,10 @@ public:

private:
    struct StagingBuffer final {
-        explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, u64 last_epoch);
-        StagingBuffer(StagingBuffer&& rhs) noexcept;
-        StagingBuffer(const StagingBuffer&) = delete;
-        ~StagingBuffer();
-
-        StagingBuffer& operator=(StagingBuffer&& rhs) noexcept;
+        explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer);

        std::unique_ptr<VKBuffer> buffer;
-        VKFenceWatch watch;
-        u64 last_epoch = 0;
+        u64 tick = 0;
    };

    struct StagingBuffers final {
@@ -73,8 +65,6 @@ private:
    StagingBuffersCache host_staging_buffers;
    StagingBuffersCache device_staging_buffers;

-    u64 epoch = 0;
-
    std::size_t current_delete_level = 0;
};
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
index 9151d9fb1..5d2c4a796 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -42,7 +42,6 @@ Flags MakeInvalidationFlags() {
    flags[DepthWriteEnable] = true;
    flags[DepthCompareOp] = true;
    flags[FrontFace] = true;
-    flags[PrimitiveTopology] = true;
    flags[StencilOp] = true;
    flags[StencilTestEnable] = true;
    return flags;
@@ -112,10 +111,6 @@ void SetupDirtyFrontFace(Tables& tables) {
    table[OFF(screen_y_control)] = FrontFace;
}

-void SetupDirtyPrimitiveTopology(Tables& tables) {
-    tables[0][OFF(draw.topology)] = PrimitiveTopology;
-}
-
void SetupDirtyStencilOp(Tables& tables) {
    auto& table = tables[0];
    table[OFF(stencil_front_op_fail)] = StencilOp;
@@ -137,12 +132,9 @@ void SetupDirtyStencilTestEnable(Tables& tables) {
}

} // Anonymous namespace

-StateTracker::StateTracker(Core::System& system)
-    : system{system}, invalidation_flags{MakeInvalidationFlags()} {}
-
-void StateTracker::Initialize() {
-    auto& dirty = system.GPU().Maxwell3D().dirty;
-    auto& tables = dirty.tables;
+StateTracker::StateTracker(Tegra::GPU& gpu)
+    : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} {
+    auto& tables = gpu.Maxwell3D().dirty.tables;
    SetupDirtyRenderTargets(tables);
    SetupDirtyViewports(tables);
    SetupDirtyScissors(tables);
@@ -156,13 +148,8 @@ void StateTracker::Initialize() {
    SetupDirtyDepthWriteEnable(tables);
    SetupDirtyDepthCompareOp(tables);
    SetupDirtyFrontFace(tables);
-    SetupDirtyPrimitiveTopology(tables);
    SetupDirtyStencilOp(tables);
    SetupDirtyStencilTestEnable(tables);
}

-void StateTracker::InvalidateCommandBufferState() {
-    system.GPU().Maxwell3D().dirty.flags |= invalidation_flags;
-}
-
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index 54ca0d6c6..1de789e57 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -32,7 +32,6 @@ enum : u8 {
    DepthWriteEnable,
    DepthCompareOp,
    FrontFace,
-    PrimitiveTopology,
    StencilOp,
    StencilTestEnable,

@@ -43,12 +42,15 @@ static_assert(Last <= std::numeric_limits<u8>::max());
} // namespace Dirty

class StateTracker {
-public:
-    explicit StateTracker(Core::System& system);
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;

-    void Initialize();
+public:
+    explicit StateTracker(Tegra::GPU& gpu);

-    void InvalidateCommandBufferState();
+    void InvalidateCommandBufferState() {
+        flags |= invalidation_flags;
+        current_topology = INVALID_TOPOLOGY;
+    }

    bool TouchViewports() {
        return Exchange(Dirty::Viewports, false);
@@ -102,10 +104,6 @@ public:
        return Exchange(Dirty::FrontFace, false);
    }

-    bool TouchPrimitiveTopology() {
-        return Exchange(Dirty::PrimitiveTopology, false);
-    }
-
    bool TouchStencilOp() {
        return Exchange(Dirty::StencilOp, false);
    }
@@ -114,16 +112,24 @@ public:
        return Exchange(Dirty::StencilTestEnable, false);
    }

+    bool ChangePrimitiveTopology(Maxwell::PrimitiveTopology new_topology) {
+        const bool has_changed = current_topology != new_topology;
+        current_topology = new_topology;
+        return has_changed;
+    }
+
private:
+    static constexpr auto INVALID_TOPOLOGY = static_cast<Maxwell::PrimitiveTopology>(~0u);
+
    bool Exchange(std::size_t id, bool new_value) const noexcept {
-        auto& flags = system.GPU().Maxwell3D().dirty.flags;
        const bool is_dirty = flags[id];
        flags[id] = new_value;
        return is_dirty;
    }

-    Core::System& system;
+    Tegra::Engines::Maxwell3D::DirtyState::Flags& flags;
    Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags;
+    Maxwell::PrimitiveTopology current_topology = INVALID_TOPOLOGY;
};

} // namespace Vulkan
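Primitive topology is no longer tracked through the dirty-flag tables; it is deduplicated by value instead, which also catches redundant register writes. A sketch of the intended call-site shape (the matching vk_rasterizer.cpp change is not excerpted here, and UpdateDynamicTopology is an invented name):

    void UpdateDynamicTopology(Vulkan::StateTracker& state_tracker,
                               Maxwell::PrimitiveTopology topology) {
        if (!state_tracker.ChangePrimitiveTopology(topology)) {
            return; // same topology as the previous draw, nothing to rebind
        }
        // Rebind topology-dependent state here. InvalidateCommandBufferState()
        // resets the cached value to INVALID_TOPOLOGY, so the first draw after
        // a submission always takes this path.
    }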
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
index a5526a3f5..1b59612b9 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
@@ -11,7 +11,6 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/renderer_vulkan/vk_device.h"
-#include "video_core/renderer_vulkan/vk_resource_manager.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/renderer_vulkan/wrapper.h"
@@ -57,9 +56,9 @@ u32 GetMemoryType(const VkPhysicalDeviceMemoryProperties& properties,

} // Anonymous namespace

-VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler,
+VKStreamBuffer::VKStreamBuffer(const VKDevice& device_, VKScheduler& scheduler_,
                               VkBufferUsageFlags usage)
-    : device{device}, scheduler{scheduler} {
+    : device{device_}, scheduler{scheduler_} {
    CreateBuffers(usage);
    ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
    ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
@@ -111,7 +110,7 @@ void VKStreamBuffer::Unmap(u64 size) {
    }
    auto& watch = current_watches[current_watch_cursor++];
    watch.upper_bound = offset;
-    watch.fence.Watch(scheduler.GetFence());
+    watch.tick = scheduler.CurrentTick();
}

void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) {
@@ -121,7 +120,8 @@ void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) {
    // Subtract from the preferred heap size some bytes to avoid getting out of memory.
    const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
-    const VkDeviceSize allocable_size = heap_size - 9 * 1024 * 1024;
+    // As per DXVK's example, using `heap_size / 2`
+    const VkDeviceSize allocable_size = heap_size / 2;
    buffer = device.GetLogical().CreateBuffer({
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
@@ -157,7 +157,7 @@ void VKStreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
    while (requested_upper_bound < wait_bound && wait_cursor < *invalidation_mark) {
        auto& watch = previous_watches[wait_cursor];
        wait_bound = watch.upper_bound;
-        watch.fence.Wait();
+        scheduler.Wait(watch.tick);
        ++wait_cursor;
    }
}
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index 689f0d276..5e15ad78f 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -14,7 +14,6 @@
namespace Vulkan {

class VKDevice;
-class VKFence;
class VKFenceWatch;
class VKScheduler;

@@ -44,8 +43,8 @@ public:
    }

private:
-    struct Watch final {
-        VKFenceWatch fence;
+    struct Watch {
+        u64 tick{};
        u64 upper_bound{};
    };

diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index 6bfd2abae..9636a7c65 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -12,7 +12,7 @@
#include "core/core.h"
#include "core/frontend/framebuffer_layout.h"
#include "video_core/renderer_vulkan/vk_device.h"
-#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_swapchain.h"
#include "video_core/renderer_vulkan/wrapper.h"

@@ -56,8 +56,8 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi

} // Anonymous namespace

-VKSwapchain::VKSwapchain(VkSurfaceKHR surface, const VKDevice& device)
-    : surface{surface}, device{device} {}
+VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const VKDevice& device_, VKScheduler& scheduler_)
+    : surface{surface_}, device{device_}, scheduler{scheduler_} {}

VKSwapchain::~VKSwapchain() = default;

@@ -75,21 +75,18 @@ void VKSwapchain::Create(u32 width, u32 height, bool srgb) {

    CreateSemaphores();
    CreateImageViews();

-    fences.resize(image_count, nullptr);
+    resource_ticks.clear();
+    resource_ticks.resize(image_count);
}

void VKSwapchain::AcquireNextImage() {
    device.GetLogical().AcquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(),
                                            *present_semaphores[frame_index], {}, &image_index);

-    if (auto& fence = fences[image_index]; fence) {
-        fence->Wait();
-        fence->Release();
-        fence = nullptr;
-    }
+    scheduler.Wait(resource_ticks[image_index]);
}

-bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) {
+bool VKSwapchain::Present(VkSemaphore render_semaphore) {
    const VkSemaphore present_semaphore{*present_semaphores[frame_index]};
    const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore};
    const auto present_queue{device.GetPresentQueue()};
@@ -123,8 +120,7 @@ bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) {
        break;
    }

-    ASSERT(fences[image_index] == nullptr);
-    fences[image_index] = &fence;
+    resource_ticks[image_index] = scheduler.CurrentTick();
    frame_index = (frame_index + 1) % static_cast<u32>(image_count);
    return recreated;
}
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h
index a35d61345..6b39befdf 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -16,11 +16,11 @@ struct FramebufferLayout;
namespace Vulkan {

class VKDevice;
-class VKFence;
+class VKScheduler;

class VKSwapchain {
public:
-    explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device);
+    explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device, VKScheduler& scheduler);
    ~VKSwapchain();

    /// Creates (or recreates) the swapchain with a given size.
@@ -31,7 +31,7 @@ public:

    /// Presents the rendered image to the swapchain. Returns true when the swapchains had to be
    /// recreated. Takes responsibility for the ownership of fence.
-    bool Present(VkSemaphore render_semaphore, VKFence& fence);
+    bool Present(VkSemaphore render_semaphore);

    /// Returns true when the framebuffer layout has changed.
    bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const;
@@ -74,6 +74,7 @@ private:

    const VkSurfaceKHR surface;
    const VKDevice& device;
+    VKScheduler& scheduler;

    vk::SwapchainKHR swapchain;

@@ -81,7 +82,7 @@ private:
    std::vector<VkImage> images;
    std::vector<vk::ImageView> image_views;
    std::vector<vk::Framebuffer> framebuffers;
-    std::vector<VKFence*> fences;
+    std::vector<u64> resource_ticks;
    std::vector<vk::Semaphore> present_semaphores;

    u32 image_index{};
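The swapchain now reuses the scheduler's timeline instead of per-image fences: Present() stamps the image with the current tick, and the next AcquireNextImage() on that image blocks only if the tick has not been signaled yet. An illustrative trace with invented tick numbers:

    // frame 0: Present(image 0) -> resource_ticks[0] = 7   (CurrentTick() at present)
    // frame 1: Present(image 1) -> resource_ticks[1] = 9
    // frame 2: Present(image 2) -> resource_ticks[2] = 12
    // frame 3: AcquireNextImage() returns image 0 -> scheduler.Wait(7)
    //          Wait(7) returns immediately if the GPU already signaled tick 7.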
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 2c6f54101..f2c8f2ae1 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -188,12 +188,10 @@ u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::Swizzl

} // Anonymous namespace

-CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
-                             VKResourceManager& resource_manager, VKMemoryManager& memory_manager,
+CachedSurface::CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager,
                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool,
                             GPUVAddr gpu_addr, const SurfaceParams& params)
-    : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, system{system},
-      device{device}, resource_manager{resource_manager},
+    : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, device{device},
      memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} {
    if (params.IsBuffer()) {
        buffer = CreateBuffer(device, params, host_memory_size);
@@ -490,19 +488,20 @@ VkImageView CachedSurfaceView::GetAttachment() {
    return *render_target;
}

-VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                               const VKDevice& device, VKResourceManager& resource_manager,
-                               VKMemoryManager& memory_manager, VKScheduler& scheduler,
-                               VKStagingBufferPool& staging_pool)
-    : TextureCache(system, rasterizer, device.IsOptimalAstcSupported()), device{device},
-      resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler},
-      staging_pool{staging_pool} {}
+VKTextureCache::VKTextureCache(VideoCore::RasterizerInterface& rasterizer,
+                               Tegra::Engines::Maxwell3D& maxwell3d,
+                               Tegra::MemoryManager& gpu_memory, const VKDevice& device_,
+                               VKMemoryManager& memory_manager_, VKScheduler& scheduler_,
+                               VKStagingBufferPool& staging_pool_)
+    : TextureCache(rasterizer, maxwell3d, gpu_memory, device_.IsOptimalAstcSupported()),
+      device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{
+                                                                                   staging_pool_} {}

VKTextureCache::~VKTextureCache() = default;

Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) {
-    return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager,
-                                           scheduler, staging_pool, gpu_addr, params);
+    return std::make_shared<CachedSurface>(device, memory_manager, scheduler, staging_pool,
+                                           gpu_addr, params);
}

void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 807e26c8a..39202feba 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -15,10 +15,6 @@
#include "video_core/texture_cache/surface_base.h"
#include "video_core/texture_cache/texture_cache.h"

-namespace Core {
-class System;
-}
-
namespace VideoCore {
class RasterizerInterface;
}
@@ -27,7 +23,6 @@ namespace Vulkan {

class RasterizerVulkan;
class VKDevice;
-class VKResourceManager;
class VKScheduler;
class VKStagingBufferPool;

@@ -45,8 +40,7 @@ class CachedSurface final : public VideoCommon::SurfaceBase<View> {
    friend CachedSurfaceView;

public:
-    explicit CachedSurface(Core::System& system, const VKDevice& device,
-                           VKResourceManager& resource_manager, VKMemoryManager& memory_manager,
+    explicit CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager,
                           VKScheduler& scheduler, VKStagingBufferPool& staging_pool,
                           GPUVAddr gpu_addr, const SurfaceParams& params);
    ~CachedSurface();
@@ -101,9 +95,7 @@ private:

    VkImageSubresourceRange GetImageSubresourceRange() const;

-    Core::System& system;
    const VKDevice& device;
-    VKResourceManager& resource_manager;
    VKMemoryManager& memory_manager;
    VKScheduler& scheduler;
    VKStagingBufferPool& staging_pool;
@@ -201,10 +193,10 @@ private:

class VKTextureCache final : public TextureCacheBase {
public:
-    explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                            const VKDevice& device, VKResourceManager& resource_manager,
-                            VKMemoryManager& memory_manager, VKScheduler& scheduler,
-                            VKStagingBufferPool& staging_pool);
+    explicit VKTextureCache(VideoCore::RasterizerInterface& rasterizer,
+                            Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory,
+                            const VKDevice& device, VKMemoryManager& memory_manager,
+                            VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
    ~VKTextureCache();

private:
@@ -219,7 +211,6 @@ private:
    void BufferCopy(Surface& src_surface, Surface& dst_surface) override;

    const VKDevice& device;
-    VKResourceManager& resource_manager;
    VKMemoryManager& memory_manager;
    VKScheduler& scheduler;
    VKStagingBufferPool& staging_pool;
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 013865aa4..4e83303d8 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -6,10 +6,12 @@
#include <exception>
#include <memory>
#include <optional>
+#include <string_view>
#include <utility>
#include <vector>

#include "common/common_types.h"
+#include "common/logging/log.h"

#include "video_core/renderer_vulkan/wrapper.h"

@@ -17,21 +19,42 @@ namespace Vulkan::vk {

namespace {

+template <typename Func>
+void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld,
+                         Func&& func) {
+    // Calling GetProperties calls Vulkan more than needed, but these calls are supposed to be
+    // cheap.
+    std::stable_sort(devices.begin(), devices.end(),
+                     [&dld, &func](VkPhysicalDevice lhs, VkPhysicalDevice rhs) {
+                         return func(vk::PhysicalDevice(lhs, dld).GetProperties(),
+                                     vk::PhysicalDevice(rhs, dld).GetProperties());
+                     });
+}
+
+void SortPhysicalDevicesPerVendor(std::vector<VkPhysicalDevice>& devices,
+                                  const InstanceDispatch& dld,
+                                  std::initializer_list<u32> vendor_ids) {
+    for (auto it = vendor_ids.end(); it != vendor_ids.begin();) {
+        --it;
+        SortPhysicalDevices(devices, dld, [id = *it](const auto& lhs, const auto& rhs) {
+            return lhs.vendorID == id && rhs.vendorID != id;
+        });
+    }
+}
+
void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) {
-    std::stable_sort(devices.begin(), devices.end(), [&](auto lhs, auto rhs) {
-        // This will call Vulkan more than needed, but these calls are cheap.
-        const auto lhs_properties = vk::PhysicalDevice(lhs, dld).GetProperties();
-        const auto rhs_properties = vk::PhysicalDevice(rhs, dld).GetProperties();
-
-        // Prefer discrete GPUs, Nvidia over AMD, AMD over Intel, Intel over the rest.
-        const bool preferred =
-            (lhs_properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU &&
-             rhs_properties.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) ||
-            (lhs_properties.vendorID == 0x10DE && rhs_properties.vendorID != 0x10DE) ||
-            (lhs_properties.vendorID == 0x1002 && rhs_properties.vendorID != 0x1002) ||
-            (lhs_properties.vendorID == 0x8086 && rhs_properties.vendorID != 0x8086);
-        return !preferred;
+    // Sort by name, this will set a base and make GPUs with higher numbers appear first
+    // (e.g. GTX 1650 will intentionally be listed before a GTX 1080).
+    SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) {
+        return std::string_view{lhs.deviceName} > std::string_view{rhs.deviceName};
    });
+    // Prefer discrete over non-discrete
+    SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) {
+        return lhs.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU &&
+               rhs.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU;
+    });
+    // Prefer Nvidia over AMD, AMD over Intel, Intel over the rest.
+    SortPhysicalDevicesPerVendor(devices, dld, {0x10DE, 0x1002, 0x8086});
}
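The rewrite works because std::stable_sort keeps the relative order of equivalent elements: each pass only pulls its own preferred key to the front, so earlier passes survive as tie-breakers, and SortPhysicalDevicesPerVendor walks vendor_ids in reverse so the first listed vendor ends up with the highest priority. A self-contained illustration of the trick:

    #include <algorithm>
    #include <initializer_list>
    #include <vector>

    int main() {
        std::vector<unsigned> vendors{0x8086, 0x13B5, 0x1002, 0x10DE};
        // Apply the lowest-priority key first; the last stable_sort wins ties.
        for (unsigned id : {0x8086, 0x1002, 0x10DE}) {
            std::stable_sort(vendors.begin(), vendors.end(),
                             [id](unsigned lhs, unsigned rhs) { return lhs == id && rhs != id; });
        }
        // vendors is now {0x10DE, 0x1002, 0x8086, 0x13B5}:
        // Nvidia, then AMD, then Intel, then everything else.
    }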

template <typename T>
@@ -148,6 +171,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
    X(vkGetFenceStatus);
    X(vkGetImageMemoryRequirements);
    X(vkGetQueryPoolResults);
+    X(vkGetSemaphoreCounterValueKHR);
    X(vkMapMemory);
    X(vkQueueSubmit);
    X(vkResetFences);
@@ -156,6 +180,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
    X(vkUpdateDescriptorSetWithTemplateKHR);
    X(vkUpdateDescriptorSets);
    X(vkWaitForFences);
+    X(vkWaitSemaphoresKHR);
#undef X
}
@@ -262,6 +287,22 @@ const char* ToString(VkResult result) noexcept {
        return "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT";
    case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT:
        return "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT";
+    case VkResult::VK_ERROR_UNKNOWN:
+        return "VK_ERROR_UNKNOWN";
+    case VkResult::VK_ERROR_INCOMPATIBLE_VERSION_KHR:
+        return "VK_ERROR_INCOMPATIBLE_VERSION_KHR";
+    case VkResult::VK_THREAD_IDLE_KHR:
+        return "VK_THREAD_IDLE_KHR";
+    case VkResult::VK_THREAD_DONE_KHR:
+        return "VK_THREAD_DONE_KHR";
+    case VkResult::VK_OPERATION_DEFERRED_KHR:
+        return "VK_OPERATION_DEFERRED_KHR";
+    case VkResult::VK_OPERATION_NOT_DEFERRED_KHR:
+        return "VK_OPERATION_NOT_DEFERRED_KHR";
+    case VkResult::VK_PIPELINE_COMPILE_REQUIRED_EXT:
+        return "VK_PIPELINE_COMPILE_REQUIRED_EXT";
+    case VkResult::VK_RESULT_MAX_ENUM:
+        return "VK_RESULT_MAX_ENUM";
    }
    return "Unknown";
}
@@ -375,18 +416,17 @@ VkResult Free(VkDevice device, VkCommandPool handle, Span<VkCommandBuffer> buffe
    return VK_SUCCESS;
}

-Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions,
+Instance Instance::Create(u32 version, Span<const char*> layers, Span<const char*> extensions,
                          InstanceDispatch& dld) noexcept {
-    static constexpr VkApplicationInfo application_info{
+    const VkApplicationInfo application_info{
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pNext = nullptr,
        .pApplicationName = "yuzu Emulator",
        .applicationVersion = VK_MAKE_VERSION(0, 1, 0),
        .pEngineName = "yuzu Emulator",
        .engineVersion = VK_MAKE_VERSION(0, 1, 0),
-        .apiVersion = VK_API_VERSION_1_1,
+        .apiVersion = version,
    };
-
    const VkInstanceCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pNext = nullptr,
@@ -558,7 +598,10 @@ Semaphore Device::CreateSemaphore() const {
        .pNext = nullptr,
        .flags = 0,
    };
+    return CreateSemaphore(ci);
+}

+Semaphore Device::CreateSemaphore(const VkSemaphoreCreateInfo& ci) const {
    VkSemaphore object;
    Check(dld->vkCreateSemaphore(handle, &ci, nullptr, &object));
    return Semaphore(object, handle, *dld);
@@ -644,7 +687,7 @@ ShaderModule Device::CreateShaderModule(const VkShaderModuleCreateInfo& ci) cons
    return ShaderModule(object, handle, *dld);
}

-Event Device::CreateNewEvent() const {
+Event Device::CreateEvent() const {
    static constexpr VkEventCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO,
        .pNext = nullptr,
@@ -775,6 +818,21 @@ VkPhysicalDeviceMemoryProperties PhysicalDevice::GetMemoryProperties() const noe
    return properties;
}

+u32 AvailableVersion(const InstanceDispatch& dld) noexcept {
+    PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion;
+    if (!Proc(vkEnumerateInstanceVersion, dld, "vkEnumerateInstanceVersion")) {
+        // If the procedure is not found, Vulkan 1.0 is assumed
+        return VK_API_VERSION_1_0;
+    }
+    u32 version;
+    if (const VkResult result = vkEnumerateInstanceVersion(&version); result != VK_SUCCESS) {
+        LOG_ERROR(Render_Vulkan, "vkEnumerateInstanceVersion returned {}, assuming Vulkan 1.1",
+                  ToString(result));
+        return VK_API_VERSION_1_1;
+    }
+    return version;
+}
+
std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
    const InstanceDispatch& dld) {
    u32 num;
@@ -786,7 +844,7 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
        VK_SUCCESS) {
        return std::nullopt;
    }
-    return std::move(properties);
+    return properties;
}

std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index b9d3fedc1..f64919623 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -267,6 +267,7 @@ struct DeviceDispatch : public InstanceDispatch {
    PFN_vkGetFenceStatus vkGetFenceStatus;
    PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements;
    PFN_vkGetQueryPoolResults vkGetQueryPoolResults;
+    PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR;
    PFN_vkMapMemory vkMapMemory;
    PFN_vkQueueSubmit vkQueueSubmit;
    PFN_vkResetFences vkResetFences;
@@ -275,6 +276,7 @@ struct DeviceDispatch : public InstanceDispatch {
    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
    PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets;
    PFN_vkWaitForFences vkWaitForFences;
+    PFN_vkWaitSemaphoresKHR vkWaitSemaphoresKHR;
};

/// Loads instance agnostic function pointers.
@@ -550,7 +552,6 @@ using PipelineLayout = Handle<VkPipelineLayout, VkDevice, DeviceDispatch>;
using QueryPool = Handle<VkQueryPool, VkDevice, DeviceDispatch>;
using RenderPass = Handle<VkRenderPass, VkDevice, DeviceDispatch>;
using Sampler = Handle<VkSampler, VkDevice, DeviceDispatch>;
-using Semaphore = Handle<VkSemaphore, VkDevice, DeviceDispatch>;
using ShaderModule = Handle<VkShaderModule, VkDevice, DeviceDispatch>;
using SurfaceKHR = Handle<VkSurfaceKHR, VkInstance, InstanceDispatch>;

@@ -563,7 +564,7 @@ class Instance : public Handle<VkInstance, NoOwner, InstanceDispatch> {

public:
    /// Creates a Vulkan instance. Use "operator bool" for error handling.
-    static Instance Create(Span<const char*> layers, Span<const char*> extensions,
+    static Instance Create(u32 version, Span<const char*> layers, Span<const char*> extensions,
                           InstanceDispatch& dld) noexcept;

    /// Enumerates physical devices.
@@ -582,7 +583,8 @@ public:
    /// Construct a queue handle.
    constexpr Queue(VkQueue queue, const DeviceDispatch& dld) noexcept : queue{queue}, dld{&dld} {}

-    VkResult Submit(Span<VkSubmitInfo> submit_infos, VkFence fence) const noexcept {
+    VkResult Submit(Span<VkSubmitInfo> submit_infos,
+                    VkFence fence = VK_NULL_HANDLE) const noexcept {
        return dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence);
    }

@@ -674,6 +676,44 @@ public:
    }
};

+class Semaphore : public Handle<VkSemaphore, VkDevice, DeviceDispatch> {
+    using Handle<VkSemaphore, VkDevice, DeviceDispatch>::Handle;
+
+public:
+    [[nodiscard]] u64 GetCounter() const {
+        u64 value;
+        Check(dld->vkGetSemaphoreCounterValueKHR(owner, handle, &value));
+        return value;
+    }
+
+    /**
+     * Waits for a timeline semaphore on the host.
+     *
+     * @param value Value to wait for
+     * @param timeout Timeout in nanoseconds
+     * @return True on successful wait, false on timeout
+     */
+    bool Wait(u64 value, u64 timeout = std::numeric_limits<u64>::max()) const {
+        const VkSemaphoreWaitInfoKHR wait_info{
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR,
+            .pNext = nullptr,
+            .flags = 0,
+            .semaphoreCount = 1,
+            .pSemaphores = &handle,
+            .pValues = &value,
+        };
+        const VkResult result = dld->vkWaitSemaphoresKHR(owner, &wait_info, timeout);
+        switch (result) {
+        case VK_SUCCESS:
+            return true;
+        case VK_TIMEOUT:
+            return false;
+        default:
+            throw Exception(result);
+        }
+    }
+};
+
class Device : public Handle<VkDevice, NoOwner, DeviceDispatch> {
    using Handle<VkDevice, NoOwner, DeviceDispatch>::Handle;

@@ -694,6 +734,8 @@ public:

    Semaphore CreateSemaphore() const;

+    Semaphore CreateSemaphore(const VkSemaphoreCreateInfo& ci) const;
+
    Fence CreateFence(const VkFenceCreateInfo& ci) const;

    DescriptorPool CreateDescriptorPool(const VkDescriptorPoolCreateInfo& ci) const;
@@ -721,7 +763,7 @@ public:

    ShaderModule CreateShaderModule(const VkShaderModuleCreateInfo& ci) const;

-    Event CreateNewEvent() const;
+    Event CreateEvent() const;

    SwapchainKHR CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const;

@@ -1048,6 +1090,8 @@ private:
    const DeviceDispatch* dld;
};

+u32 AvailableVersion(const InstanceDispatch& dld) noexcept;
+
std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
    const InstanceDispatch& dld);
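With vkGetSemaphoreCounterValueKHR and vkWaitSemaphoresKHR loaded, the new vk::Semaphore helpers are enough to poll or block on a timeline from the host. Example usage, assuming the device enabled VK_KHR_timeline_semaphore and the semaphore was created with VK_SEMAPHORE_TYPE_TIMELINE_KHR:

    const u64 signaled = semaphore.GetCounter(); // vkGetSemaphoreCounterValueKHR
    // Block up to 5 ms for the next tick. Wait() returns false on VK_TIMEOUT
    // and throws vk::Exception on any other failure.
    if (!semaphore.Wait(signaled + 1, 5'000'000)) {
        // The GPU has not advanced the timeline yet.
    }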
diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h
index cca13bcde..8e5a22ab3 100644
--- a/src/video_core/shader/ast.h
+++ b/src/video_core/shader/ast.h
@@ -199,55 +199,48 @@ public:
    }

    std::optional<u32> GetGotoLabel() const {
-        auto inner = std::get_if<ASTGoto>(&data);
-        if (inner) {
+        if (const auto* inner = std::get_if<ASTGoto>(&data)) {
            return {inner->label};
        }
-        return {};
+        return std::nullopt;
    }

    Expr GetGotoCondition() const {
-        auto inner = std::get_if<ASTGoto>(&data);
-        if (inner) {
+        if (const auto* inner = std::get_if<ASTGoto>(&data)) {
            return inner->condition;
        }
        return nullptr;
    }

    void MarkLabelUnused() {
-        auto inner = std::get_if<ASTLabel>(&data);
-        if (inner) {
+        if (auto* inner = std::get_if<ASTLabel>(&data)) {
            inner->unused = true;
        }
    }

    bool IsLabelUnused() const {
-        auto inner = std::get_if<ASTLabel>(&data);
-        if (inner) {
+        if (const auto* inner = std::get_if<ASTLabel>(&data)) {
            return inner->unused;
        }
        return true;
    }

    std::optional<u32> GetLabelIndex() const {
-        auto inner = std::get_if<ASTLabel>(&data);
-        if (inner) {
+        if (const auto* inner = std::get_if<ASTLabel>(&data)) {
            return {inner->index};
        }
-        return {};
+        return std::nullopt;
    }

    Expr GetIfCondition() const {
-        auto inner = std::get_if<ASTIfThen>(&data);
-        if (inner) {
+        if (const auto* inner = std::get_if<ASTIfThen>(&data)) {
            return inner->condition;
        }
        return nullptr;
    }

    void SetGotoCondition(Expr new_condition) {
-        auto inner = std::get_if<ASTGoto>(&data);
-        if (inner) {
+        if (auto* inner = std::get_if<ASTGoto>(&data)) {
            inner->condition = std::move(new_condition);
        }
    }
diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp
index f815584f7..6920afdf2 100644
--- a/src/video_core/shader/async_shaders.cpp
+++ b/src/video_core/shader/async_shaders.cpp
@@ -20,14 +20,15 @@ AsyncShaders::~AsyncShaders() {
}

void AsyncShaders::AllocateWorkers() {
-    // Max worker threads we should allow
-    constexpr u32 MAX_THREADS = 4;
-    // Deduce how many threads we can use
-    const u32 threads_used = std::thread::hardware_concurrency() / 4;
-    // Always allow at least 1 thread regardless of our settings
-    const auto max_worker_count = std::max(1U, threads_used);
-    // Don't use more than MAX_THREADS
-    const auto num_workers = std::min(max_worker_count, MAX_THREADS);
+    // Use at least one thread
+    u32 num_workers = 1;
+
+    // Deduce how many more threads we can use
+    const u32 thread_count = std::thread::hardware_concurrency();
+    if (thread_count >= 8) {
+        // Increase async workers by 1 for every 2 threads >= 8
+        num_workers += 1 + (thread_count - 8) / 2;
+    }

    // If we already have workers queued, ignore
    if (num_workers == worker_threads.size()) {
@@ -42,8 +43,8 @@ void AsyncShaders::AllocateWorkers() {
    // Create workers
    for (std::size_t i = 0; i < num_workers; i++) {
        context_list.push_back(emu_window.CreateSharedContext());
-        worker_threads.push_back(
-            std::thread(&AsyncShaders::ShaderCompilerThread, this, context_list[i].get()));
+        worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this,
+                                    context_list[i].get());
    }
}

@@ -73,11 +74,11 @@ void AsyncShaders::KillWorkers() {
    worker_threads.clear();
}

-bool AsyncShaders::HasWorkQueued() {
+bool AsyncShaders::HasWorkQueued() const {
    return !pending_queue.empty();
}

-bool AsyncShaders::HasCompletedWork() {
+bool AsyncShaders::HasCompletedWork() const {
    std::shared_lock lock{completed_mutex};
    return !finished_work.empty();
}
@@ -102,11 +103,10 @@ bool AsyncShaders::IsShaderAsync(const Tegra::GPU& gpu) const {
}

std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() {
-    std::vector<AsyncShaders::Result> results;
+    std::vector<Result> results;
    {
        std::unique_lock lock{completed_mutex};
-        results.assign(std::make_move_iterator(finished_work.begin()),
-                       std::make_move_iterator(finished_work.end()));
+        results = std::move(finished_work);
        finished_work.clear();
    }
    return results;
@@ -115,11 +115,10 @@ std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() {

void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device,
                                     Tegra::Engines::ShaderType shader_type, u64 uid,
                                     std::vector<u64> code, std::vector<u64> code_b,
-                                     u32 main_offset,
-                                     VideoCommon::Shader::CompilerSettings compiler_settings,
-                                     const VideoCommon::Shader::Registry& registry,
-                                     VAddr cpu_addr) {
-    WorkerParams params{
+                                     u32 main_offset, CompilerSettings compiler_settings,
+                                     const Registry& registry, VAddr cpu_addr) {
+    std::unique_lock lock(queue_mutex);
+    pending_queue.push({
        .backend = device.UseAssemblyShaders() ? Backend::GLASM : Backend::OpenGL,
        .device = &device,
        .shader_type = shader_type,
@@ -130,9 +129,7 @@ void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device,
        .compiler_settings = compiler_settings,
        .registry = registry,
        .cpu_address = cpu_addr,
-    };
-    std::unique_lock lock(queue_mutex);
-    pending_queue.push(std::move(params));
+    });
    cv.notify_one();
}

@@ -144,7 +141,8 @@ void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache,
                                     std::vector<VkDescriptorSetLayoutBinding> bindings,
                                     Vulkan::SPIRVProgram program,
                                     Vulkan::GraphicsPipelineCacheKey key) {
-    WorkerParams params{
+    std::unique_lock lock(queue_mutex);
+    pending_queue.push({
        .backend = Backend::Vulkan,
        .pp_cache = pp_cache,
        .vk_device = &device,
@@ -152,13 +150,10 @@ void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache,
        .descriptor_pool = &descriptor_pool,
        .update_descriptor_queue = &update_descriptor_queue,
        .renderpass_cache = &renderpass_cache,
-        .bindings = bindings,
-        .program = program,
+        .bindings = std::move(bindings),
+        .program = std::move(program),
        .key = key,
-    };
-
-    std::unique_lock lock(queue_mutex);
-    pending_queue.push(std::move(params));
+    });
    cv.notify_one();
}
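For reference, the replacement heuristic in AllocateWorkers() evaluates as follows on common thread counts:

    // hardware_concurrency() | num_workers
    // -----------------------+----------------------------
    //            4           | 1
    //            8           | 2   (1 + 1 + (8 - 8) / 2)
    //           12           | 4   (1 + 1 + (12 - 8) / 2)
    //           16           | 6   (1 + 1 + (16 - 8) / 2)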
We try to "guess" which /// shader would be used only once - bool IsShaderAsync(const Tegra::GPU& gpu) const; + [[nodiscard]] bool IsShaderAsync(const Tegra::GPU& gpu) const; /// Pulls completed compiled shaders - std::vector<Result> GetCompletedWork(); + [[nodiscard]] std::vector<Result> GetCompletedWork(); void QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, u64 uid, std::vector<u64> code, std::vector<u64> code_b, u32 main_offset, - VideoCommon::Shader::CompilerSettings compiler_settings, - const VideoCommon::Shader::Registry& registry, VAddr cpu_addr); + CompilerSettings compiler_settings, const Registry& registry, + VAddr cpu_addr); void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::VKDevice& device, Vulkan::VKScheduler& scheduler, @@ -97,7 +106,7 @@ private: void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); /// Check our worker queue to see if we have any work queued already - bool HasWorkQueued(); + [[nodiscard]] bool HasWorkQueued() const; struct WorkerParams { Backend backend; @@ -108,8 +117,8 @@ private: std::vector<u64> code; std::vector<u64> code_b; u32 main_offset; - VideoCommon::Shader::CompilerSettings compiler_settings; - std::optional<VideoCommon::Shader::Registry> registry; + CompilerSettings compiler_settings; + std::optional<Registry> registry; VAddr cpu_address; // For Vulkan @@ -125,13 +134,13 @@ private: }; std::condition_variable cv; - std::mutex queue_mutex; - std::shared_mutex completed_mutex; + mutable std::mutex queue_mutex; + mutable std::shared_mutex completed_mutex; std::atomic<bool> is_thread_exiting{}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> context_list; std::vector<std::thread> worker_threads; std::queue<WorkerParams> pending_queue; - std::vector<AsyncShaders::Result> finished_work; + std::vector<Result> finished_work; Core::Frontend::EmuWindow& emu_window; }; diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 336397cdb..4c8971615 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -547,13 +547,13 @@ bool TryQuery(CFGRebuildState& state) { gather_labels(q2.ssy_stack, state.ssy_labels, block); gather_labels(q2.pbk_stack, state.pbk_labels, block); if (std::holds_alternative<SingleBranch>(*block.branch)) { - const auto branch = std::get_if<SingleBranch>(block.branch.get()); + auto* branch = std::get_if<SingleBranch>(block.branch.get()); if (!branch->condition.IsUnconditional()) { q2.address = block.end + 1; state.queries.push_back(q2); } - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); if (branch->is_sync) { if (branch->address == unassigned_branch) { branch->address = conditional_query.ssy_stack.top(); @@ -567,21 +567,21 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = branch->address; - state.queries.push_back(std::move(conditional_query)); return true; } - const auto multi_branch = std::get_if<MultiBranch>(block.branch.get()); + + const auto* multi_branch = std::get_if<MultiBranch>(block.branch.get()); for (const auto& branch_case : multi_branch->branches) { - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); conditional_query.address = branch_case.address; - state.queries.push_back(std::move(conditional_query)); } + return true; } void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { - const auto get_expr 
= ([&](const Condition& cond) -> Expr { - Expr result{}; + const auto get_expr = [](const Condition& cond) -> Expr { + Expr result; if (cond.cc != ConditionCode::T) { result = MakeExpr<ExprCondCode>(cond.cc); } @@ -594,10 +594,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { } Expr extra = MakeExpr<ExprPredicate>(pred); if (negate) { - extra = MakeExpr<ExprNot>(extra); + extra = MakeExpr<ExprNot>(std::move(extra)); } if (result) { - return MakeExpr<ExprAnd>(extra, result); + return MakeExpr<ExprAnd>(std::move(extra), std::move(result)); } return extra; } @@ -605,9 +605,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { return result; } return MakeExpr<ExprBoolean>(true); - }); + }; + if (std::holds_alternative<SingleBranch>(*branch_info)) { - const auto branch = std::get_if<SingleBranch>(branch_info.get()); + const auto* branch = std::get_if<SingleBranch>(branch_info.get()); if (branch->address < 0) { if (branch->kill) { mm.InsertReturn(get_expr(branch->condition), true); @@ -619,7 +620,7 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { mm.InsertGoto(get_expr(branch->condition), branch->address); return; } - const auto multi_branch = std::get_if<MultiBranch>(branch_info.get()); + const auto* multi_branch = std::get_if<MultiBranch>(branch_info.get()); for (const auto& branch_case : multi_branch->branches) { mm.InsertGoto(MakeExpr<ExprGprEqual>(multi_branch->gpr, branch_case.cmp_value), branch_case.address); diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..afef5948d 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -137,7 +137,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index a276aee44..88103fede 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -53,6 +53,9 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { absolute_a = ((instr.value >> 44) & 1) != 0; absolute_b = ((instr.value >> 54) & 1) != 0; break; + default: + UNREACHABLE(); + break; } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp index 73880db0e..2a30aab2b 100644 --- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp @@ -28,23 +28,26 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD32I: { UNIMPLEMENTED_IF_MSG(instr.iadd32i.saturate, "IADD32I saturation is not implemented"); - op_a = GetOperandAbsNegInteger(op_a, false, instr.iadd32i.negate_a, true); + op_a = GetOperandAbsNegInteger(std::move(op_a), false, instr.iadd32i.negate_a != 0, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::IAdd, PRECISE, std::move(op_a), std::move(op_b)); - SetInternalFlagsFromInteger(bb, 
value, instr.op_32.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc != 0); + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::LOP32I: { - if (instr.alu.lop32i.invert_a) - op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_a); + if (instr.alu.lop32i.invert_a) { + op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_a)); + } - if (instr.alu.lop32i.invert_b) - op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_b); + if (instr.alu.lop32i.invert_b) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } - WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, - PredicateResultMode::None, Pred::UnusedIndex, instr.op_32.generates_cc); + WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, std::move(op_a), + std::move(op_b), PredicateResultMode::None, Pred::UnusedIndex, + instr.op_32.generates_cc != 0); break; } default: @@ -58,14 +61,14 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation logic_op, Node op_a, Node op_b, PredicateResultMode predicate_mode, Pred predicate, bool sets_cc) { - const Node result = [&]() { + Node result = [&] { switch (logic_op) { case LogicOperation::And: - return Operation(OperationCode::IBitwiseAnd, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseAnd, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Or: - return Operation(OperationCode::IBitwiseOr, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseOr, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Xor: - return Operation(OperationCode::IBitwiseXor, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseXor, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::PassB: return op_b; default: @@ -84,8 +87,8 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation return; case PredicateResultMode::NotZero: { // Set the predicate to true if the result is not zero. - const Node compare = Operation(OperationCode::LogicalINotEqual, result, Immediate(0)); - SetPredicate(bb, static_cast<u64>(predicate), compare); + Node compare = Operation(OperationCode::LogicalINotEqual, std::move(result), Immediate(0)); + SetPredicate(bb, static_cast<u64>(predicate), std::move(compare)); break; } default: diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index e75ca4fdb..618d309d2 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -119,6 +119,8 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; + default: + break; } UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; @@ -220,9 +222,10 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 
4 : 0; + default: + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return 0; } - UNIMPLEMENTED_MSG("Texture format not implemented={}", format); - return 0; } std::size_t GetImageComponentMask(TextureFormat format) { @@ -257,9 +260,10 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R8: case TextureFormat::R1: return std::size_t{R}; + default: + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return std::size_t{R | G | B | A}; } - UNIMPLEMENTED_MSG("Texture format not implemented={}", format); - return std::size_t{R | G | B | A}; } std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) { @@ -463,7 +467,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return OperationCode::AtomicImageXor; case Tegra::Shader::ImageAtomicOperation::Exch: return OperationCode::AtomicImageExchange; + default: + break; } + break; default: break; } diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index e4739394d..e2bba88dd 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -386,7 +386,8 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::RED: { - UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32); + UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32, "type={}", + static_cast<int>(instr.red.type.Value())); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); if (!real_address || !base_address) { diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 29ebf65ba..02fdccd86 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -292,33 +292,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } - std::vector<Node> coords; - - // TODO: Add coordinates for different samplers once other texture types are implemented. - switch (texture_type) { - case TextureType::Texture1D: - coords.push_back(GetRegister(instr.gpr8)); - break; - case TextureType::Texture2D: - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - break; - default: - UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + const u64 base_index = is_array ? 1 : 0; + const u64 num_components = [texture_type] { + switch (texture_type) { + case TextureType::Texture1D: + return 1; + case TextureType::Texture2D: + return 2; + case TextureType::TextureCube: + return 3; + default: + UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + return 2; + } + }(); + // TODO: What's the array component used for? 
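The rewritten gather makes the TMML register layout explicit: when is_array is set, gpr8+0 carries the array index and the coordinates shift up by one register. For reference:

    // Coordinate registers consumed above (base_index = is_array ? 1 : 0):
    //   Texture1D          -> { gpr8 }
    //   Texture2D          -> { gpr8+0, gpr8+1 }
    //   TextureCube        -> { gpr8+0, gpr8+1, gpr8+2 }
    //   Texture2D (array)  -> { gpr8+1, gpr8+2 }   // gpr8+0 holds the array index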
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 29ebf65ba..02fdccd86 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -292,33 +292,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
             break;
         }
 
-        std::vector<Node> coords;
-
-        // TODO: Add coordinates for different samplers once other texture types are implemented.
-        switch (texture_type) {
-        case TextureType::Texture1D:
-            coords.push_back(GetRegister(instr.gpr8));
-            break;
-        case TextureType::Texture2D:
-            coords.push_back(GetRegister(instr.gpr8.Value() + 0));
-            coords.push_back(GetRegister(instr.gpr8.Value() + 1));
-            break;
-        default:
-            UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type));
+        const u64 base_index = is_array ? 1 : 0;
+        const u64 num_components = [texture_type] {
+            switch (texture_type) {
+            case TextureType::Texture1D:
+                return 1;
+            case TextureType::Texture2D:
+                return 2;
+            case TextureType::TextureCube:
+                return 3;
+            default:
+                UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type));
+                return 2;
+            }
+        }();
+        // TODO: What's the array component used for?
 
-            // Fallback to interpreting as a 2D texture for now
-            coords.push_back(GetRegister(instr.gpr8.Value() + 0));
-            coords.push_back(GetRegister(instr.gpr8.Value() + 1));
+        std::vector<Node> coords;
+        coords.reserve(num_components);
+        for (u64 component = 0; component < num_components; ++component) {
+            coords.push_back(GetRegister(instr.gpr8.Value() + base_index + component));
         }
+
         u32 indexer = 0;
         for (u32 element = 0; element < 2; ++element) {
             if (!instr.tmml.IsComponentEnabled(element)) {
                 continue;
             }
-            auto params = coords;
             MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var};
-            const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
-            SetTemporary(bb, indexer++, value);
+            Node value = Operation(OperationCode::TextureQueryLod, meta, coords);
+            SetTemporary(bb, indexer++, std::move(value));
         }
         for (u32 i = 0; i < indexer; ++i) {
             SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
@@ -553,7 +556,6 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
     const bool is_shadow = depth_compare != nullptr;
     const bool is_bindless = bindless_reg.has_value();
 
-    UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow);
     ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow,
                "Illegal texture type");
 
@@ -763,7 +765,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
 
 Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) {
     const auto texture_type{instr.tld.texture_type};
-    const bool is_array{instr.tld.is_array};
+    const bool is_array{instr.tld.is_array != 0};
     const bool lod_enabled{instr.tld.GetTextureProcessMode() == TextureProcessMode::LL};
     const std::size_t coord_count{GetCoordCount(texture_type)};
 
diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp
index 5071c83ca..e18ccba8e 100644
--- a/src/video_core/shader/memory_util.cpp
+++ b/src/video_core/shader/memory_util.cpp
@@ -16,11 +16,10 @@
 
 namespace VideoCommon::Shader {
 
-GPUVAddr GetShaderAddress(Core::System& system,
+GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d,
                           Tegra::Engines::Maxwell3D::Regs::ShaderProgram program) {
-    const auto& gpu{system.GPU().Maxwell3D()};
-    const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
-    return gpu.regs.code_address.CodeAddress() + shader_config.offset;
+    const auto& shader_config{maxwell3d.regs.shader_config[static_cast<std::size_t>(program)]};
+    return maxwell3d.regs.code_address.CodeAddress() + shader_config.offset;
 }
 
 bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
diff --git a/src/video_core/shader/memory_util.h b/src/video_core/shader/memory_util.h
index be90d24fd..4624d38e6 100644
--- a/src/video_core/shader/memory_util.h
+++ b/src/video_core/shader/memory_util.h
@@ -11,10 +11,6 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 
-namespace Core {
-class System;
-}
-
 namespace Tegra {
 class MemoryManager;
 }
@@ -27,7 +23,7 @@ constexpr u32 STAGE_MAIN_OFFSET = 10;
 constexpr u32 KERNEL_MAIN_OFFSET = 0;
 
 /// Gets the address for the specified shader stage program
-GPUVAddr GetShaderAddress(Core::System& system,
+GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d,
                           Tegra::Engines::Maxwell3D::Regs::ShaderProgram program);
 
 /// Gets if the current instruction offset is a scheduler instruction
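DecodeTexture now computes the coordinate count once, via an immediately invoked lambda, and fills the coordinate registers in a single loop keyed off base_index, instead of special-casing each texture type. The sketch below reproduces that shape under stated assumptions: GatherCoords and the plain integer "registers" are illustrative stand-ins for the real GetRegister/Node machinery.

#include <cstdint>
#include <vector>

enum class TextureType { Texture1D, Texture2D, TextureCube };

// The immediately invoked lambda lets num_components stay const while
// still being selected by a switch; coordinate registers then start
// after the optional array-index register.
std::vector<std::uint64_t> GatherCoords(TextureType type, bool is_array,
                                        std::uint64_t gpr8) {
    const std::uint64_t base_index = is_array ? 1 : 0;
    const std::uint64_t num_components = [type]() -> std::uint64_t {
        switch (type) {
        case TextureType::Texture1D:
            return 1;
        case TextureType::Texture2D:
            return 2;
        case TextureType::TextureCube:
            return 3;
        }
        return 2; // fallback mirrors the hunk's default branch
    }();

    std::vector<std::uint64_t> coords;
    coords.reserve(num_components);
    for (std::uint64_t component = 0; component < num_components; ++component) {
        coords.push_back(gpr8 + base_index + component); // register indices
    }
    return coords;
}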
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index cdf274e54..148d91fcb 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -24,44 +24,45 @@ GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterfac
     if (shader_stage == ShaderType::Compute) {
         return {};
     }
-    auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine);
-
-    GraphicsInfo info;
-    info.tfb_layouts = graphics.regs.tfb_layouts;
-    info.tfb_varying_locs = graphics.regs.tfb_varying_locs;
-    info.primitive_topology = graphics.regs.draw.topology;
-    info.tessellation_primitive = graphics.regs.tess_mode.prim;
-    info.tessellation_spacing = graphics.regs.tess_mode.spacing;
-    info.tfb_enabled = graphics.regs.tfb_enabled;
-    info.tessellation_clockwise = graphics.regs.tess_mode.cw;
-    return info;
+
+    auto& graphics = dynamic_cast<Tegra::Engines::Maxwell3D&>(engine);
+
+    return {
+        .tfb_layouts = graphics.regs.tfb_layouts,
+        .tfb_varying_locs = graphics.regs.tfb_varying_locs,
+        .primitive_topology = graphics.regs.draw.topology,
+        .tessellation_primitive = graphics.regs.tess_mode.prim,
+        .tessellation_spacing = graphics.regs.tess_mode.spacing,
+        .tfb_enabled = graphics.regs.tfb_enabled != 0,
+        .tessellation_clockwise = graphics.regs.tess_mode.cw.Value() != 0,
+    };
 }
 
 ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
     if (shader_stage != ShaderType::Compute) {
         return {};
     }
-    auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine);
+
+    auto& compute = dynamic_cast<Tegra::Engines::KeplerCompute&>(engine);
     const auto& launch = compute.launch_description;
 
-    ComputeInfo info;
-    info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z};
-    info.local_memory_size_in_words = launch.local_pos_alloc;
-    info.shared_memory_size_in_words = launch.shared_alloc;
-    return info;
+    return {
+        .workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z},
+        .shared_memory_size_in_words = launch.shared_alloc,
+        .local_memory_size_in_words = launch.local_pos_alloc,
+    };
 }
 
 } // Anonymous namespace
 
-Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info)
+Registry::Registry(ShaderType shader_stage, const SerializedRegistryInfo& info)
     : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile},
       bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {}
 
-Registry::Registry(Tegra::Engines::ShaderType shader_stage,
-                   Tegra::Engines::ConstBufferEngineInterface& engine)
-    : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()},
-      graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo(
-                                                                 shader_stage, engine)} {}
+Registry::Registry(ShaderType shader_stage, ConstBufferEngineInterface& engine_)
+    : stage{shader_stage}, engine{&engine_}, bound_buffer{engine_.GetBoundBuffer()},
+      graphics_info{MakeGraphicsInfo(shader_stage, engine_)}, compute_info{MakeComputeInfo(
+                                                                  shader_stage, engine_)} {}
 
 Registry::~Registry() = default;
 
@@ -113,8 +114,7 @@ std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler
     return value;
 }
 
-std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
-                                                                                 u32 offset) {
+std::optional<SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) {
     const std::pair key = {buffer, offset};
     const auto iter = bindless_samplers.find(key);
     if (iter != bindless_samplers.end()) {
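MakeGraphicsInfo and MakeComputeInfo now return their structs through C++20 designated initializers, which must appear in member declaration order; that constraint is presumably why shared_memory_size_in_words is now listed before local_memory_size_in_words. The ComputeInfo mirror below is hypothetical, kept just large enough to demonstrate the rule:

#include <array>
#include <cstdint>

// Hypothetical mirror of ComputeInfo; only the member order matters here.
struct ComputeInfoSketch {
    std::array<std::uint32_t, 3> workgroup_size{};
    std::uint32_t shared_memory_size_in_words = 0;
    std::uint32_t local_memory_size_in_words = 0;
};

ComputeInfoSketch MakeInfo() {
    // Swapping the last two designators would be ill-formed: C++20
    // designated initializers must follow declaration order.
    return {
        .workgroup_size = {8, 8, 1},
        .shared_memory_size_in_words = 16,
        .local_memory_size_in_words = 4,
    };
}

The static_cast-to-dynamic_cast swap in the same hunks adds an RTTI-checked downcast: if the engine reference were ever the wrong concrete type, the cast throws std::bad_cast instead of silently yielding an invalid reference.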
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 231206765..4bebefdde 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -94,7 +94,7 @@ public:
     explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info);
 
     explicit Registry(Tegra::Engines::ShaderType shader_stage,
-                      Tegra::Engines::ConstBufferEngineInterface& engine);
+                      Tegra::Engines::ConstBufferEngineInterface& engine_);
 
     ~Registry();
 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index d5ed81442..6be3ea92b 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -205,12 +205,12 @@ std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code,
     const auto result = TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1);
     const auto& found = result.first;
     if (!found) {
-        return {};
+        return std::nullopt;
     }
     if (const auto immediate = std::get_if<ImmediateNode>(&*found)) {
         return immediate->GetValue();
     }
-    return {};
+    return std::nullopt;
 }
 
 std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code,
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index dfcf36e0b..b44c09d71 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -115,20 +115,24 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
     if (gpu_addr == candidate_gpu_addr) {
         return {{0, 0}};
     }
+
     if (candidate_gpu_addr < gpu_addr) {
-        return {};
+        return std::nullopt;
     }
+
     const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
     const auto layer{static_cast<u32>(relative_address / layer_size)};
     if (layer >= params.depth) {
-        return {};
+        return std::nullopt;
     }
+
     const GPUVAddr mipmap_address = relative_address - layer_size * layer;
     const auto mipmap_it =
         Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
     if (mipmap_it == mipmap_offsets.end()) {
-        return {};
+        return std::nullopt;
    }
+
     const auto level{static_cast<u32>(std::distance(mipmap_offsets.begin(), mipmap_it))};
     return std::make_pair(layer, level);
 }
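track.cpp and surface_base.cpp replace "return {};" with "return std::nullopt;" in functions returning std::optional. Both forms construct an empty optional, so this is purely about intent: nullopt reads as "no value" and cannot be mistaken for value-initializing the payload. A minimal illustration:

#include <optional>

std::optional<unsigned> FindLayer(bool found) {
    if (!found) {
        return std::nullopt; // same effect as 'return {};', clearer intent
    }
    return 0u; // an engaged optional holding zero, distinct from nullopt
}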
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index e614a92df..13dd16356 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -163,13 +163,11 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl
     return params;
 }
 
-SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-
+SurfaceParams SurfaceParams::CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d) {
+    const auto& regs = maxwell3d.regs;
     const auto block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U);
     const bool is_layered = regs.zeta_layers > 1 && block_depth == 0;
     const auto pixel_format = PixelFormatFromDepthFormat(regs.zeta.format);
-
     return {
         .is_tiled = regs.zeta.memory_layout.type ==
                     Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear,
@@ -191,8 +189,9 @@ SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) {
     };
 }
 
-SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::size_t index) {
-    const auto& config{system.GPU().Maxwell3D().regs.rt[index]};
+SurfaceParams SurfaceParams::CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d,
+                                                  std::size_t index) {
+    const auto& config{maxwell3d.regs.rt[index]};
     SurfaceParams params;
     params.is_tiled =
         config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
@@ -241,6 +240,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
         .is_tiled = is_tiled,
         .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB ||
                            config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB,
+        .is_layered = false,
         .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U,
         .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U,
         .block_depth = is_tiled ? std::min(config.BlockDepth(), 5U) : 0U,
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 118aa689e..4466c3c34 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -33,10 +33,11 @@ public:
                                         const VideoCommon::Shader::Image& entry);
 
     /// Creates SurfaceCachedParams for a depth buffer configuration.
-    static SurfaceParams CreateForDepthBuffer(Core::System& system);
+    static SurfaceParams CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d);
 
     /// Creates SurfaceCachedParams from a framebuffer configuration.
-    static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index);
+    static SurfaceParams CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d,
+                                              std::size_t index);
 
     /// Creates SurfaceCachedParams from a Fermi2D surface configuration.
     static SurfaceParams CreateForFermiCopySurface(
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 96c4e4cc2..ea835c59f 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -135,8 +135,7 @@ public:
             return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
         }
 
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr) {
             return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
         }
@@ -160,8 +159,7 @@ public:
         if (!gpu_addr) {
             return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
         }
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr) {
             return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
         }
@@ -183,11 +181,11 @@ public:
     TView GetDepthBufferSurface(bool preserve_contents) {
         std::lock_guard lock{mutex};
-        auto& maxwell3d = system.GPU().Maxwell3D();
-        if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) {
+        auto& dirty = maxwell3d.dirty;
+        if (!dirty.flags[VideoCommon::Dirty::ZetaBuffer]) {
             return depth_buffer.view;
         }
-        maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false;
+        dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false;
 
         const auto& regs{maxwell3d.regs};
         const auto gpu_addr{regs.zeta.Address()};
@@ -195,13 +193,12 @@ public:
             SetEmptyDepthBuffer();
             return {};
         }
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
        if (!cpu_addr) {
             SetEmptyDepthBuffer();
             return {};
         }
-        const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)};
+        const auto depth_params{SurfaceParams::CreateForDepthBuffer(maxwell3d)};
         auto surface_view =
             GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true);
         if (depth_buffer.target)
             depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
@@ -215,7 +212,6 @@ public:
     TView GetColorBufferSurface(std::size_t index, bool preserve_contents) {
         std::lock_guard lock{mutex};
         ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
-        auto& maxwell3d = system.GPU().Maxwell3D();
         if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) {
             return render_targets[index].view;
         }
@@ -235,15 +231,14 @@ public:
             return {};
         }
 
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr) {
             SetEmptyColorBuffer(index);
             return {};
         }
 
         auto surface_view =
-            GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
+            GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(maxwell3d, index),
                        preserve_contents, true);
         if (render_targets[index].target) {
             auto& surface = render_targets[index].target;
@@ -300,9 +295,8 @@ public:
         const GPUVAddr dst_gpu_addr = dst_config.Address();
         DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
 
-        const auto& memory_manager = system.GPU().MemoryManager();
-        const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
-        const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
+        const std::optional<VAddr> dst_cpu_addr = gpu_memory.GpuToCpuAddress(dst_gpu_addr);
+        const std::optional<VAddr> src_cpu_addr = gpu_memory.GpuToCpuAddress(src_gpu_addr);
         std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
         TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
         ImageBlit(src_surface, dst_surface.second, copy_config);
@@ -358,9 +352,11 @@ public:
     }
 
 protected:
-    explicit TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                          bool is_astc_supported)
-        : system{system}, is_astc_supported{is_astc_supported}, rasterizer{rasterizer} {
+    explicit TextureCache(VideoCore::RasterizerInterface& rasterizer_,
+                          Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_,
+                          bool is_astc_supported_)
+        : is_astc_supported{is_astc_supported_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_},
+          gpu_memory{gpu_memory_} {
         for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
             SetEmptyColorBuffer(i);
         }
@@ -395,7 +391,7 @@ protected:
     virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0;
 
     void ManageRenderTargetUnregister(TSurface& surface) {
-        auto& dirty = system.GPU().Maxwell3D().dirty;
+        auto& dirty = maxwell3d.dirty;
         const u32 index = surface->GetRenderTarget();
         if (index == DEPTH_RT) {
             dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true;
@@ -408,8 +404,7 @@ protected:
     void Register(TSurface surface) {
         const GPUVAddr gpu_addr = surface->GetGpuAddr();
         const std::size_t size = surface->GetSizeInBytes();
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr) {
             LOG_CRITICAL(HW_GPU, "Failed to register surface with unmapped gpu_address 0x{:016x}",
                          gpu_addr);
@@ -459,7 +454,6 @@ protected:
         return new_surface;
     }
 
-    Core::System& system;
     const bool is_astc_supported;
 
 private:
@@ -954,8 +948,7 @@ private:
      * @param params The parameters on the candidate surface.
      **/
    Deduction DeduceSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) {
-        const std::optional<VAddr> cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
 
         if (!cpu_addr) {
             Deduction result{};
@@ -1112,7 +1105,7 @@ private:
     void LoadSurface(const TSurface& surface) {
         staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes());
-        surface->LoadBuffer(system.GPU().MemoryManager(), staging_cache);
+        surface->LoadBuffer(gpu_memory, staging_cache);
         surface->UploadTexture(staging_cache.GetBuffer(0));
         surface->MarkAsModified(false, Tick());
     }
@@ -1123,7 +1116,7 @@ private:
         }
         staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes());
         surface->DownloadTexture(staging_cache.GetBuffer(0));
-        surface->FlushBuffer(system.GPU().MemoryManager(), staging_cache);
+        surface->FlushBuffer(gpu_memory, staging_cache);
         surface->MarkAsModified(false, Tick());
     }
@@ -1253,6 +1246,8 @@ private:
     }
 
     VideoCore::RasterizerInterface& rasterizer;
+    Tegra::Engines::Maxwell3D& maxwell3d;
+    Tegra::MemoryManager& gpu_memory;
 
     FormatLookupTable format_lookup_table;
     FormatCompatibility format_compatibility;
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 4e3a092c7..dd5cee4a1 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -21,14 +21,17 @@ namespace {
 std::unique_ptr<VideoCore::RendererBase> CreateRenderer(
     Core::System& system, Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu,
     std::unique_ptr<Core::Frontend::GraphicsContext> context) {
+    auto& telemetry_session = system.TelemetrySession();
+    auto& cpu_memory = system.Memory();
+
     switch (Settings::values.renderer_backend.GetValue()) {
     case Settings::RendererBackend::OpenGL:
-        return std::make_unique<OpenGL::RendererOpenGL>(system, emu_window, gpu,
-                                                        std::move(context));
+        return std::make_unique<OpenGL::RendererOpenGL>(telemetry_session, emu_window, cpu_memory,
+                                                        gpu, std::move(context));
 #ifdef HAS_VULKAN
     case Settings::RendererBackend::Vulkan:
-        return std::make_unique<Vulkan::RendererVulkan>(system, emu_window, gpu,
-                                                        std::move(context));
+        return std::make_unique<Vulkan::RendererVulkan>(telemetry_session, emu_window, cpu_memory,
+                                                        gpu, std::move(context));
 #endif
     default:
         return nullptr;
@@ -41,10 +44,11 @@ namespace VideoCore {
 
 std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
     std::unique_ptr<Tegra::GPU> gpu;
+    const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
     if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
-        gpu = std::make_unique<VideoCommon::GPUAsynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec);
     } else {
-        gpu = std::make_unique<VideoCommon::GPUSynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec);
     }
 
     auto context = emu_window.CreateSharedContext();
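The thread running through the surface_params, texture_cache, and video_core hunks above is dependency injection: rather than navigating Core::System at every call site (system.GPU().Maxwell3D(), system.GPU().MemoryManager()), each class now receives and stores references to exactly the engines it uses. A compressed sketch with illustrative stand-in types, not the real yuzu classes:

// Stand-ins for Tegra::Engines::Maxwell3D and Tegra::MemoryManager.
struct Maxwell3D {};
struct MemoryManager {};

class TextureCacheLike {
public:
    // Dependencies are explicit in the constructor signature, so a test
    // can hand the cache fake engines without building a whole System.
    explicit TextureCacheLike(Maxwell3D& maxwell3d_, MemoryManager& gpu_memory_)
        : maxwell3d{maxwell3d_}, gpu_memory{gpu_memory_} {}

private:
    Maxwell3D& maxwell3d;      // replaces repeated system.GPU().Maxwell3D()
    MemoryManager& gpu_memory; // replaces system.GPU().MemoryManager()
};

The same motive explains CreateRenderer pulling telemetry_session and cpu_memory out of System once at the top instead of threading System itself into each renderer.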
