Diffstat (limited to 'src/video_core')
220 files changed, 20450 insertions, 9452 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 258d58eba..abcee2a1c 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,13 +1,37 @@ +add_subdirectory(host_shaders) + add_library(video_core STATIC buffer_cache/buffer_block.h buffer_cache/buffer_cache.h + buffer_cache/map_interval.cpp buffer_cache/map_interval.h + cdma_pusher.cpp + cdma_pusher.h + command_classes/codecs/codec.cpp + command_classes/codecs/codec.h + command_classes/codecs/h264.cpp + command_classes/codecs/h264.h + command_classes/codecs/vp9.cpp + command_classes/codecs/vp9.h + command_classes/codecs/vp9_types.h + command_classes/host1x.cpp + command_classes/host1x.h + command_classes/nvdec.cpp + command_classes/nvdec.h + command_classes/nvdec_common.h + command_classes/sync_manager.cpp + command_classes/sync_manager.h + command_classes/vic.cpp + command_classes/vic.h + compatible_formats.cpp + compatible_formats.h dirty_flags.cpp dirty_flags.h dma_pusher.cpp dma_pusher.h engines/const_buffer_engine_interface.h engines/const_buffer_info.h + engines/engine_interface.h engines/engine_upload.cpp engines/engine_upload.h engines/fermi_2d.cpp @@ -23,6 +47,15 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + macro/macro.cpp + macro/macro.h + macro/macro_hle.cpp + macro/macro_hle.h + macro/macro_interpreter.cpp + macro/macro_interpreter.h + macro/macro_jit_x64.cpp + macro/macro_jit_x64.h + fence_manager.h gpu.cpp gpu.h gpu_asynch.cpp @@ -33,8 +66,6 @@ add_library(video_core STATIC gpu_thread.h guest_driver.cpp guest_driver.h - macro_interpreter.cpp - macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp @@ -42,15 +73,17 @@ add_library(video_core STATIC query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h - rasterizer_cache.cpp - rasterizer_cache.h rasterizer_interface.h renderer_base.cpp renderer_base.h + renderer_opengl/gl_arb_decompiler.cpp + renderer_opengl/gl_arb_decompiler.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_device.cpp renderer_opengl/gl_device.h + renderer_opengl/gl_fence_manager.cpp + renderer_opengl/gl_fence_manager.h renderer_opengl/gl_framebuffer_cache.cpp renderer_opengl/gl_framebuffer_cache.h renderer_opengl/gl_rasterizer.cpp @@ -84,6 +117,9 @@ add_library(video_core STATIC renderer_opengl/utils.h sampler_cache.cpp sampler_cache.h + shader_cache.h + shader_notify.cpp + shader_notify.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp @@ -114,6 +150,8 @@ add_library(video_core STATIC shader/decode/other.cpp shader/ast.cpp shader/ast.h + shader/async_shaders.cpp + shader/async_shaders.h shader/compiler_settings.cpp shader/compiler_settings.h shader/control_flow.cpp @@ -121,6 +159,8 @@ add_library(video_core STATIC shader/decode.cpp shader/expr.cpp shader/expr.h + shader/memory_util.cpp + shader/memory_util.h shader/node_helper.cpp shader/node_helper.h shader/node.h @@ -160,12 +200,16 @@ if (ENABLE_VULKAN) renderer_vulkan/fixed_pipeline_state.h renderer_vulkan/maxwell_to_vk.cpp renderer_vulkan/maxwell_to_vk.h + renderer_vulkan/nsight_aftermath_tracker.cpp + renderer_vulkan/nsight_aftermath_tracker.h renderer_vulkan/renderer_vulkan.h renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h + renderer_vulkan/vk_command_pool.cpp + renderer_vulkan/vk_command_pool.h 
renderer_vulkan/vk_compute_pass.cpp renderer_vulkan/vk_compute_pass.h renderer_vulkan/vk_compute_pipeline.cpp @@ -174,10 +218,14 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_descriptor_pool.h renderer_vulkan/vk_device.cpp renderer_vulkan/vk_device.h + renderer_vulkan/vk_fence_manager.cpp + renderer_vulkan/vk_fence_manager.h renderer_vulkan/vk_graphics_pipeline.cpp renderer_vulkan/vk_graphics_pipeline.h renderer_vulkan/vk_image.cpp renderer_vulkan/vk_image.h + renderer_vulkan/vk_master_semaphore.cpp + renderer_vulkan/vk_master_semaphore.h renderer_vulkan/vk_memory_manager.cpp renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp @@ -188,8 +236,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp renderer_vulkan/vk_renderpass_cache.h - renderer_vulkan/vk_resource_manager.cpp - renderer_vulkan/vk_resource_manager.h + renderer_vulkan/vk_resource_pool.cpp + renderer_vulkan/vk_resource_pool.h renderer_vulkan/vk_sampler_cache.cpp renderer_vulkan/vk_sampler_cache.h renderer_vulkan/vk_scheduler.cpp @@ -213,21 +261,55 @@ if (ENABLE_VULKAN) renderer_vulkan/wrapper.cpp renderer_vulkan/wrapper.h ) - - target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) - target_compile_definitions(video_core PRIVATE HAS_VULKAN) endif() create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PRIVATE glad) +target_link_libraries(video_core PRIVATE glad xbyak) + +if (MSVC) + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib) +else() + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES}) +endif() + +add_dependencies(video_core host_shaders) +target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) + if (ENABLE_VULKAN) + target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) + target_compile_definitions(video_core PRIVATE HAS_VULKAN) target_link_libraries(video_core PRIVATE sirit) endif() +if (ENABLE_NSIGHT_AFTERMATH) + if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) + message(ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided") + endif() + if (NOT WIN32) + message(ERROR "Nsight Aftermath doesn't support non-Windows platforms") + endif() + target_compile_definitions(video_core PRIVATE HAS_NSIGHT_AFTERMATH) + target_include_directories(video_core PRIVATE "$ENV{NSIGHT_AFTERMATH_SDK}/include") +endif() + if (MSVC) target_compile_options(video_core PRIVATE /we4267) else() - target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion) + target_compile_options(video_core PRIVATE + -Werror=conversion + -Wno-error=sign-conversion + -Werror=pessimizing-move + -Werror=redundant-move + -Werror=switch + -Werror=type-limits + -Werror=unused-variable + + $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable> + ) endif() diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e35ee0b67..e64170e66 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -15,48 +15,47 @@ namespace VideoCommon { class BufferBlock { public: - bool 
Overlaps(const VAddr start, const VAddr end) const { + bool Overlaps(VAddr start, VAddr end) const { return (cpu_addr < end) && (cpu_addr_end > start); } - bool IsInside(const VAddr other_start, const VAddr other_end) const { + bool IsInside(VAddr other_start, VAddr other_end) const { return cpu_addr <= other_start && other_end <= cpu_addr_end; } - std::size_t GetOffset(const VAddr in_addr) { + std::size_t Offset(VAddr in_addr) const { return static_cast<std::size_t>(in_addr - cpu_addr); } - VAddr GetCpuAddr() const { + VAddr CpuAddr() const { return cpu_addr; } - VAddr GetCpuAddrEnd() const { + VAddr CpuAddrEnd() const { return cpu_addr_end; } - void SetCpuAddr(const VAddr new_addr) { + void SetCpuAddr(VAddr new_addr) { cpu_addr = new_addr; cpu_addr_end = new_addr + size; } - std::size_t GetSize() const { + std::size_t Size() const { return size; } - void SetEpoch(u64 new_epoch) { - epoch = new_epoch; + u64 Epoch() const { + return epoch; } - u64 GetEpoch() { - return epoch; + void SetEpoch(u64 new_epoch) { + epoch = new_epoch; } protected: - explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} { - SetCpuAddr(cpu_addr); + explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { + SetCpuAddr(cpu_addr_); } - ~BufferBlock() = default; private: VAddr cpu_addr{}; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b57c0d4d4..e7edd733f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,7 +4,7 @@ #pragma once -#include <array> +#include <list> #include <memory> #include <mutex> #include <unordered_map> @@ -12,14 +12,17 @@ #include <utility> #include <vector> -#include <boost/icl/interval_map.hpp> +#include <boost/container/small_vector.hpp> #include <boost/icl/interval_set.hpp> -#include <boost/range/iterator_range.hpp> +#include <boost/intrusive/set.hpp> #include "common/alignment.h" +#include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "core/core.h" #include "core/memory.h" +#include "core/settings.h" #include "video_core/buffer_cache/buffer_block.h" #include "video_core/buffer_cache/map_interval.h" #include "video_core/memory_manager.h" @@ -27,105 +30,122 @@ namespace VideoCommon { -using MapInterval = std::shared_ptr<MapIntervalBase>; - -template <typename TBuffer, typename TBufferType, typename StreamBuffer> +template <typename Buffer, typename BufferType, typename StreamBuffer> class BufferCache { + using IntervalSet = boost::icl::interval_set<VAddr>; + using IntervalType = typename IntervalSet::interval_type; + using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; + + static constexpr u64 WRITE_PAGE_BIT = 11; + static constexpr u64 BLOCK_PAGE_BITS = 21; + static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + public: - using BufferInfo = std::pair<const TBufferType*, u64>; + struct BufferInfo { + BufferType handle; + u64 offset; + u64 address; + }; BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; - const std::optional<VAddr> cpu_addr_opt = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - - if (!cpu_addr_opt) { - return {GetEmptyBuffer(size), 0}; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return GetEmptyBuffer(size); } - VAddr cpu_addr = *cpu_addr_opt; - // Cache management 
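The reworked BufferBlock keeps a half-open CPU range [cpu_addr, cpu_addr_end) and exposes Overlaps/IsInside predicates that the cache uses to match uploads against existing blocks. A minimal standalone sketch of those two checks, using hypothetical free functions rather than the class from the change:

// Illustrative sketch of BufferBlock's half-open interval checks; the free
// functions and main() are hypothetical, not part of the committed code.
#include <cassert>
#include <cstdint>

using VAddr = std::uint64_t;

// True when [start, end) intersects [block_start, block_end).
bool Overlaps(VAddr block_start, VAddr block_end, VAddr start, VAddr end) {
    return block_start < end && start < block_end;
}

// True when [other_start, other_end) is fully contained in [block_start, block_end).
bool IsInside(VAddr block_start, VAddr block_end, VAddr other_start, VAddr other_end) {
    return block_start <= other_start && other_end <= block_end;
}

int main() {
    // Block covering [0x1000, 0x3000)
    assert(Overlaps(0x1000, 0x3000, 0x2000, 0x4000));  // partial overlap
    assert(!Overlaps(0x1000, 0x3000, 0x3000, 0x4000)); // touching the end is not an overlap
    assert(IsInside(0x1000, 0x3000, 0x1800, 0x2000));  // fully contained
    return 0;
}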
is a big overhead, so only cache entries with a given size. // TODO: Figure out which size is the best for given games. constexpr std::size_t max_stream_size = 0x800; if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - auto& memory_manager = system.GPU().MemoryManager(); + if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { + const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return ConstBufferUpload(host_ptr, size); + u8* dest; + if (is_granular) { + dest = gpu_memory.GetPointer(gpu_addr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return ConstBufferUpload(staging_buffer.data(), size); + dest = staging_buffer.data(); + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); } + return ConstBufferUpload(dest, size); + } + if (is_granular) { + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); + return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { + std::memcpy(dest, host_ptr, size); + }); } else { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return StreamBufferUpload(host_ptr, size, alignment); - } else { - staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return StreamBufferUpload(staging_buffer.data(), size, alignment); - } + return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); + }); } } } - auto block = GetBlock(cpu_addr, size); - auto map = MapAddress(block, gpu_addr, cpu_addr, size); + Buffer* const block = GetBlock(*cpu_addr, size); + MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); + if (!map) { + return GetEmptyBuffer(size); + } if (is_written) { map->MarkAsModified(true, GetModifiedTicks()); - if (!map->IsWritten()) { - map->MarkAsWritten(true); - MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); + if (Settings::IsGPULevelHigh() && + Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + MarkForAsyncFlush(map); } - } else { - if (map->IsWritten()) { - WriteBarrier(); + if (!map->is_written) { + map->is_written = true; + MarkRegionAsWritten(map->start, map->end - 1); } } - const u64 offset = static_cast<u64>(block->GetOffset(cpu_addr)); - - return {ToHandle(block), offset}; + return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 
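UploadMemory now hands StreamBufferUpload a callable that fills the mapped destination, instead of always taking a raw source pointer, so the copy (memcpy for granular ranges, ReadBlockUnsafe otherwise) is deferred until the stream-buffer slot has been reserved. A condensed sketch of that pattern under assumed names (StreamRing and its fields are stand-ins, not the real stream buffer):

// Sketch of the callable-based stream upload pattern; illustrative only.
#include <cstddef>
#include <cstring>
#include <vector>

struct StreamRing {
    std::vector<unsigned char> storage = std::vector<unsigned char>(0x10000);
    std::size_t offset = 0;

    // Reserve an aligned slot, let the caller fill it, return the slot offset.
    template <typename Callable>
    std::size_t Upload(std::size_t size, std::size_t alignment, Callable&& fill) {
        offset = (offset + alignment - 1) & ~(alignment - 1); // align the write cursor
        const std::size_t uploaded_offset = offset;
        fill(storage.data() + uploaded_offset); // the copy happens only here
        offset += size;
        return uploaded_offset;
    }
};

int main() {
    StreamRing ring;
    const char payload[] = "constant buffer data";
    // The callable captures the source, mirroring the memcpy path in UploadMemory.
    ring.Upload(sizeof(payload), 4, [&](unsigned char* dest) {
        std::memcpy(dest, payload, sizeof(payload));
    });
    return 0;
}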
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4) { std::lock_guard lock{mutex}; - return StreamBufferUpload(raw_pointer, size, alignment); + return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { + std::memcpy(dest, raw_pointer, size); + }); } - void Map(std::size_t max_size) { + /// Prepares the buffer cache for data uploading + /// @param max_size Maximum number of bytes that will be uploaded + /// @return True when a stream buffer invalidation was required, false otherwise + bool Map(std::size_t max_size) { std::lock_guard lock{mutex}; + bool invalidated; std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); buffer_offset = buffer_offset_base; + + return invalidated; } - /// Finishes the upload stream, returns true on bindings invalidation. - bool Unmap() { + /// Finishes the upload stream + void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - return std::exchange(invalidated, false); } + /// Function called at the end of each frame, inteded for deferred operations void TickFrame() { ++epoch; + while (!pending_destruction.empty()) { // Delay at least 4 frames before destruction. // This is due to triple buffering happening on some drivers. static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { + if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { break; } - pending_destruction.pop_front(); + pending_destruction.pop(); } } @@ -133,117 +153,193 @@ public: void FlushRegion(VAddr addr, std::size_t size) { std::lock_guard lock{mutex}; - std::vector<MapInterval> objects = GetMapsInRange(addr, size); - std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (auto& object : objects) { - if (object->IsModified() && object->IsRegistered()) { + VectorMapInterval objects = GetMapsInRange(addr, size); + std::sort(objects.begin(), objects.end(), + [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); + for (MapInterval* object : objects) { + if (object->is_modified && object->is_registered) { + mutex.unlock(); FlushMap(object); + mutex.lock(); } } } + bool MustFlushRegion(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + const VectorMapInterval objects = GetMapsInRange(addr, size); + return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { + return map->is_modified && map->is_registered; + }); + } + /// Mark the specified region as being invalidated void InvalidateRegion(VAddr addr, u64 size) { std::lock_guard lock{mutex}; - std::vector<MapInterval> objects = GetMapsInRange(addr, size); - for (auto& object : objects) { - if (object->IsRegistered()) { + for (auto& object : GetMapsInRange(addr, size)) { + if (object->is_registered) { Unregister(object); } } } - virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0; + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; -protected: - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr<StreamBuffer> stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, - stream_buffer_handle{this->stream_buffer->GetHandle()} {} + for (MapInterval* object : GetMapsInRange(addr, size)) { + if (object->is_memory_marked && 
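TickFrame drains a queue of retired buffers, but only once they are at least five frames old, so memory a triple-buffering driver may still reference is not freed too early. A minimal sketch of that epoch bookkeeping; Resource and DeferredReclaimer are hypothetical names:

// Epoch-delayed destruction sketch, following the TickFrame logic above.
#include <cstdint>
#include <memory>
#include <queue>

struct Resource {
    std::uint64_t epoch = 0; // frame in which the resource was retired
};

class DeferredReclaimer {
public:
    void Queue(std::shared_ptr<Resource> res) {
        res->epoch = epoch;
        pending.push(std::move(res));
    }

    // Called once per frame; destroys resources retired five or more frames ago.
    void TickFrame() {
        ++epoch;
        static constexpr std::uint64_t epochs_to_destroy = 5;
        while (!pending.empty()) {
            if (pending.front()->epoch + epochs_to_destroy > epoch) {
                break; // still potentially in flight on the GPU/driver
            }
            pending.pop(); // last shared_ptr reference dropped here
        }
    }

private:
    std::uint64_t epoch = 0;
    std::queue<std::shared_ptr<Resource>> pending;
};

int main() {
    DeferredReclaimer reclaimer;
    reclaimer.Queue(std::make_shared<Resource>());
    for (int i = 0; i < 6; ++i) {
        reclaimer.TickFrame(); // the queued resource is released on the 5th tick
    }
    return 0;
}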
object->is_registered) { + UnmarkMemory(object); + object->is_sync_pending = true; + marked_for_unregister.emplace_back(object); + } + } + } - ~BufferCache() = default; + void SyncGuestHost() { + std::lock_guard lock{mutex}; + + for (auto& object : marked_for_unregister) { + if (object->is_registered) { + object->is_sync_pending = false; + Unregister(object); + } + } + marked_for_unregister.clear(); + } + + void CommitAsyncFlushes() { + if (uncommitted_flushes) { + auto commit_list = std::make_shared<std::list<MapInterval*>>(); + for (MapInterval* map : *uncommitted_flushes) { + if (map->is_registered && map->is_modified) { + // TODO(Blinkhawk): Implement backend asynchronous flushing + // AsyncFlushMap(map) + commit_list->push_back(map); + } + } + if (!commit_list->empty()) { + committed_flushes.push_back(commit_list); + } else { + committed_flushes.emplace_back(); + } + } else { + committed_flushes.emplace_back(); + } + uncommitted_flushes.reset(); + } - virtual const TBufferType* ToHandle(const TBuffer& storage) = 0; + bool ShouldWaitAsyncFlushes() const { + return !committed_flushes.empty() && committed_flushes.front() != nullptr; + } - virtual void WriteBarrier() = 0; + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (MapInterval* map : *flush_list) { + if (map->is_registered) { + // TODO(Blinkhawk): Replace this for reading the asynchronous flush + FlushMap(map); + } + } + committed_flushes.pop_front(); + } - virtual TBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; + virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; - virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, - const u8* data) = 0; +protected: + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + std::unique_ptr<StreamBuffer> stream_buffer_) + : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, + stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {} - virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, - u8* data) = 0; + ~BufferCache() = default; - virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) = 0; + virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { return {}; } /// Register an object into the cache - void Register(const MapInterval& new_map, bool inherit_written = false) { - const VAddr cpu_addr = new_map->GetStart(); + MapInterval* Register(MapInterval new_map, bool inherit_written = false) { + const VAddr cpu_addr = new_map.start; if (!cpu_addr) { LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", - new_map->GetGpuAddress()); - return; + new_map.gpu_addr); + return nullptr; } - const std::size_t size = new_map->GetEnd() - new_map->GetStart(); - new_map->MarkAsRegistered(true); - const IntervalType interval{new_map->GetStart(), new_map->GetEnd()}; - mapped_addresses.insert({interval, new_map}); + const std::size_t size = new_map.end - new_map.start; + new_map.is_registered = true; 
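The async-flush plumbing keeps an uncommitted set of modified maps; CommitAsyncFlushes snapshots it into a per-commit list (or an empty placeholder so ordering is preserved), and PopAsyncFlushes later flushes whatever in that list is still registered. A reduced sketch of the two-stage queue, with a plain int standing in for MapInterval* and console output standing in for FlushMap:

// Two-stage flush bookkeeping sketch (uncommitted set -> committed lists); illustrative.
#include <iostream>
#include <list>
#include <memory>
#include <unordered_set>

class AsyncFlushQueue {
public:
    void MarkForFlush(int map) {
        if (!uncommitted) {
            uncommitted = std::make_shared<std::unordered_set<int>>();
        }
        uncommitted->insert(map);
    }

    // Freeze the currently marked maps into one commit entry.
    void Commit() {
        if (uncommitted && !uncommitted->empty()) {
            committed.emplace_back(
                std::make_shared<std::list<int>>(uncommitted->begin(), uncommitted->end()));
        } else {
            committed.emplace_back(); // keep ordering even when nothing was marked
        }
        uncommitted.reset();
    }

    // Flush the oldest commit entry, if any.
    void Pop() {
        if (committed.empty()) {
            return;
        }
        auto& list = committed.front();
        if (list) {
            for (const int map : *list) {
                std::cout << "flush map " << map << '\n'; // stand-in for FlushMap()
            }
        }
        committed.pop_front();
    }

private:
    std::shared_ptr<std::unordered_set<int>> uncommitted;
    std::list<std::shared_ptr<std::list<int>>> committed;
};

int main() {
    AsyncFlushQueue queue;
    queue.MarkForFlush(1);
    queue.MarkForFlush(2);
    queue.Commit();
    queue.Pop();
    return 0;
}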
rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); + new_map.is_memory_marked = true; if (inherit_written) { - MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1); - new_map->MarkAsWritten(true); + MarkRegionAsWritten(new_map.start, new_map.end - 1); + new_map.is_written = true; } + MapInterval* const storage = mapped_addresses_allocator.Allocate(); + *storage = new_map; + mapped_addresses.insert(*storage); + return storage; } - /// Unregisters an object from the cache - void Unregister(MapInterval& map) { - const std::size_t size = map->GetEnd() - map->GetStart(); - rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1); - map->MarkAsRegistered(false); - if (map->IsWritten()) { - UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); + void UnmarkMemory(MapInterval* map) { + if (!map->is_memory_marked) { + return; } - const IntervalType delete_interval{map->GetStart(), map->GetEnd()}; - mapped_addresses.erase(delete_interval); + const std::size_t size = map->end - map->start; + rasterizer.UpdatePagesCachedCount(map->start, size, -1); + map->is_memory_marked = false; } -private: - MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) { - return std::make_shared<MapIntervalBase>(start, end, gpu_addr); + /// Unregisters an object from the cache + void Unregister(MapInterval* map) { + UnmarkMemory(map); + map->is_registered = false; + if (map->is_sync_pending) { + map->is_sync_pending = false; + marked_for_unregister.remove(map); + } + if (map->is_written) { + UnmarkRegionAsWritten(map->start, map->end - 1); + } + const auto it = mapped_addresses.find(*map); + ASSERT(it != mapped_addresses.end()); + mapped_addresses.erase(it); + mapped_addresses_allocator.Release(map); } - MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr, - const std::size_t size) { - - std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size); +private: + MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { + const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); if (overlaps.empty()) { - auto& memory_manager = system.GPU().MemoryManager(); const VAddr cpu_addr_end = cpu_addr + size; - MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr); - if (memory_manager.IsGranularRange(gpu_addr, size)) { - u8* host_ptr = memory_manager.GetPointer(gpu_addr); - UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); + if (gpu_memory.IsGranularRange(gpu_addr, size)) { + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); + block->Upload(block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); + gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); + block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); } - Register(new_map); - return new_map; + return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); } const VAddr cpu_addr_end = cpu_addr + size; if (overlaps.size() == 1) { - MapInterval& current_map = overlaps[0]; + MapInterval* const current_map = overlaps[0]; if (current_map->IsInside(cpu_addr, cpu_addr_end)) { return current_map; } @@ -253,57 +349,70 @@ private: bool write_inheritance = false; bool modified_inheritance = false; // Calculate new buffer parameters - for (auto& overlap : overlaps) { - new_start = std::min(overlap->GetStart(), 
new_start); - new_end = std::max(overlap->GetEnd(), new_end); - write_inheritance |= overlap->IsWritten(); - modified_inheritance |= overlap->IsModified(); + for (MapInterval* overlap : overlaps) { + new_start = std::min(overlap->start, new_start); + new_end = std::max(overlap->end, new_end); + write_inheritance |= overlap->is_written; + modified_inheritance |= overlap->is_modified; } GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; for (auto& overlap : overlaps) { Unregister(overlap); } UpdateBlock(block, new_start, new_end, overlaps); - MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr); + + const MapInterval new_map{new_start, new_end, new_gpu_addr}; + MapInterval* const map = Register(new_map, write_inheritance); + if (!map) { + return nullptr; + } if (modified_inheritance) { - new_map->MarkAsModified(true, GetModifiedTicks()); + map->MarkAsModified(true, GetModifiedTicks()); + if (Settings::IsGPULevelHigh() && + Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + MarkForAsyncFlush(map); + } } - Register(new_map, write_inheritance); - return new_map; + return map; } - void UpdateBlock(const TBuffer& block, VAddr start, VAddr end, - std::vector<MapInterval>& overlaps) { + void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { const IntervalType base_interval{start, end}; IntervalSet interval_set{}; interval_set.add(base_interval); for (auto& overlap : overlaps) { - const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()}; + const IntervalType subtract{overlap->start, overlap->end}; interval_set.subtract(subtract); } for (auto& interval : interval_set) { - std::size_t size = interval.upper() - interval.lower(); - if (size > 0) { - staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(interval.lower()), size, - staging_buffer.data()); + const std::size_t size = interval.upper() - interval.lower(); + if (size == 0) { + continue; } + staging_buffer.resize(size); + cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); + block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); } } - std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) { + VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { + VectorMapInterval result; if (size == 0) { - return {}; + return result; } - std::vector<MapInterval> objects{}; - const IntervalType interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) { - objects.push_back(pair.second); + const VAddr addr_end = addr + size; + auto it = mapped_addresses.lower_bound(addr); + if (it != mapped_addresses.begin()) { + --it; } - - return objects; + while (it != mapped_addresses.end() && it->start < addr_end) { + if (it->Overlaps(addr, addr_end)) { + result.push_back(&*it); + } + ++it; + } + return result; } /// Returns a ticks counter used for tracking when cached objects were last modified @@ -311,24 +420,28 @@ private: return ++modified_ticks; } - void FlushMap(MapInterval map) { - std::size_t size = map->GetEnd() - map->GetStart(); - TBuffer block = blocks[map->GetStart() >> block_page_bits]; + void FlushMap(MapInterval* map) { + const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); + ASSERT_OR_EXECUTE(it != blocks.end(), return;); + + std::shared_ptr<Buffer> block = it->second; + + const std::size_t size = map->end - map->start; 
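GetMapsInRange now walks a container ordered by each map's start address: lower_bound on the query start, step back one entry so a map that begins before the range is not missed, then scan forward while starts stay below the query end. A sketch of the same lookup over a std::map (the boost::intrusive::set in the change behaves equivalently for this query; names here are illustrative):

// Overlap query over an ordered container keyed by interval start.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

using VAddr = std::uint64_t;

struct Interval {
    VAddr start;
    VAddr end; // half-open
};

std::vector<Interval> GetMapsInRange(const std::map<VAddr, Interval>& maps, VAddr addr,
                                     std::size_t size) {
    std::vector<Interval> result;
    if (size == 0) {
        return result;
    }
    const VAddr addr_end = addr + size;
    auto it = maps.lower_bound(addr);
    if (it != maps.begin()) {
        --it; // the previous interval may still reach into [addr, addr_end)
    }
    while (it != maps.end() && it->second.start < addr_end) {
        if (addr < it->second.end) {
            result.push_back(it->second);
        }
        ++it;
    }
    return result;
}

int main() {
    std::map<VAddr, Interval> maps;
    maps.emplace(0x1000, Interval{0x1000, 0x2000});
    maps.emplace(0x4000, Interval{0x4000, 0x5000});
    // Query [0x1800, 0x4800): both intervals overlap it.
    std::cout << GetMapsInRange(maps, 0x1800, 0x3000).size() << '\n'; // prints 2
    return 0;
}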
staging_buffer.resize(size); - DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data()); - system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size); + block->Download(block->Offset(map->start), size, staging_buffer.data()); + cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } - BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, - std::size_t alignment) { + template <typename Callable> + BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { AlignBuffer(alignment); const std::size_t uploaded_offset = buffer_offset; - std::memcpy(buffer_ptr, raw_pointer, size); + callable(buffer_ptr); buffer_ptr += size; buffer_offset += size; - return {&stream_buffer_handle, uploaded_offset}; + return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; } void AlignBuffer(std::size_t alignment) { @@ -338,151 +451,148 @@ private: buffer_offset = offset_aligned; } - TBuffer EnlargeBlock(TBuffer buffer) { - const std::size_t old_size = buffer->GetSize(); - const std::size_t new_size = old_size + block_page_size; - const VAddr cpu_addr = buffer->GetCpuAddr(); - TBuffer new_buffer = CreateBlock(cpu_addr, new_size); - CopyBlock(buffer, new_buffer, 0, 0, old_size); - buffer->SetEpoch(epoch); - pending_destruction.push_back(buffer); + std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { + const std::size_t old_size = buffer->Size(); + const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; + const VAddr cpu_addr = buffer->CpuAddr(); + std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); + new_buffer->CopyFrom(*buffer, 0, 0, old_size); + QueueDestruction(std::move(buffer)); + const VAddr cpu_addr_end = cpu_addr + new_size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } + return new_buffer; } - TBuffer MergeBlocks(TBuffer first, TBuffer second) { - const std::size_t size_1 = first->GetSize(); - const std::size_t size_2 = second->GetSize(); - const VAddr first_addr = first->GetCpuAddr(); - const VAddr second_addr = second->GetCpuAddr(); + std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, + std::shared_ptr<Buffer> second) { + const std::size_t size_1 = first->Size(); + const std::size_t size_2 = second->Size(); + const VAddr first_addr = first->CpuAddr(); + const VAddr second_addr = second->CpuAddr(); const VAddr new_addr = std::min(first_addr, second_addr); const std::size_t new_size = size_1 + size_2; - TBuffer new_buffer = CreateBlock(new_addr, new_size); - CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); - CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); - first->SetEpoch(epoch); - second->SetEpoch(epoch); - pending_destruction.push_back(first); - pending_destruction.push_back(second); + + std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); + new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); + new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); + QueueDestruction(std::move(first)); + 
QueueDestruction(std::move(second)); + const VAddr cpu_addr_end = new_addr + new_size - 1; - u64 page_start = new_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } return new_buffer; } - TBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { - TBuffer found{}; + Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { + std::shared_ptr<Buffer> found; + const VAddr cpu_addr_end = cpu_addr + size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { auto it = blocks.find(page_start); if (it == blocks.end()) { if (found) { found = EnlargeBlock(found); - } else { - const VAddr start_addr = (page_start << block_page_bits); - found = CreateBlock(start_addr, block_page_size); - blocks[page_start] = found; - } - } else { - if (found) { - if (found == it->second) { - ++page_start; - continue; - } - found = MergeBlocks(found, it->second); - } else { - found = it->second; + continue; } + const VAddr start_addr = page_start << BLOCK_PAGE_BITS; + found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); + blocks.insert_or_assign(page_start, found); + continue; + } + if (!found) { + found = it->second; + continue; + } + if (found != it->second) { + found = MergeBlocks(std::move(found), it->second); } - ++page_start; } - return found; + return found.get(); } - void MarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { - auto it = written_pages.find(page_start); - if (it != written_pages.end()) { - it->second = it->second + 1; - } else { - written_pages[page_start] = 1; + void MarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { + if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { + ++it->second; } - page_start++; } } - void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void UnmarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { auto it = written_pages.find(page_start); if (it != written_pages.end()) { if (it->second > 1) { - it->second = it->second - 1; + --it->second; } else { written_pages.erase(it); } } - page_start++; } } - bool IsRegionWritten(const VAddr start, const VAddr end) const { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + bool IsRegionWritten(VAddr start, VAddr end) const { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { if (written_pages.count(page_start) > 0) { return true; } - page_start++; } return false; } + void 
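Blocks are tracked in a page table of 2 MiB pages (BLOCK_PAGE_BITS = 21): GetBlock converts a CPU range into a page-index range, creates a block for pages with no entry, enlarges a block that runs past its page, and merges distinct blocks found in the same range. The page-index arithmetic itself reduces to the following hypothetical helper, shown only for the shift-and-loop pattern:

// Page-index arithmetic for mapping a CPU range onto 2 MiB block pages.
// Illustrative only; the real GetBlock also creates, enlarges and merges blocks.
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr std::uint64_t BLOCK_PAGE_BITS = 21; // 2 MiB pages

void VisitBlockPages(std::uint64_t cpu_addr, std::size_t size) {
    const std::uint64_t cpu_addr_end = cpu_addr + size - 1; // inclusive end
    const std::uint64_t page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
    for (std::uint64_t page = cpu_addr >> BLOCK_PAGE_BITS; page <= page_end; ++page) {
        std::cout << "touch block page " << page << " starting at 0x" << std::hex
                  << (page << BLOCK_PAGE_BITS) << std::dec << '\n';
    }
}

int main() {
    VisitBlockPages(0x1FF000, 0x4000); // crosses a 2 MiB boundary -> pages 0 and 1
    return 0;
}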
QueueDestruction(std::shared_ptr<Buffer> buffer) { + buffer->SetEpoch(epoch); + pending_destruction.push(std::move(buffer)); + } + + void MarkForAsyncFlush(MapInterval* map) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); + } + uncommitted_flushes->insert(map); + } + VideoCore::RasterizerInterface& rasterizer; - Core::System& system; + Tegra::MemoryManager& gpu_memory; + Core::Memory::Memory& cpu_memory; std::unique_ptr<StreamBuffer> stream_buffer; - TBufferType stream_buffer_handle{}; - - bool invalidated = false; + BufferType stream_buffer_handle; u8* buffer_ptr = nullptr; u64 buffer_offset = 0; u64 buffer_offset_base = 0; - using IntervalSet = boost::icl::interval_set<VAddr>; - using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>; - using IntervalType = typename IntervalCache::interval_type; - IntervalCache mapped_addresses; + MapIntervalAllocator mapped_addresses_allocator; + boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> + mapped_addresses; - static constexpr u64 write_page_bit = 11; std::unordered_map<u64, u32> written_pages; + std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; - static constexpr u64 block_page_bits = 21; - static constexpr u64 block_page_size = 1ULL << block_page_bits; - std::unordered_map<u64, TBuffer> blocks; - - std::list<TBuffer> pending_destruction; + std::queue<std::shared_ptr<Buffer>> pending_destruction; u64 epoch = 0; u64 modified_ticks = 0; std::vector<u8> staging_buffer; + std::list<MapInterval*> marked_for_unregister; + + std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; + std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes; + std::recursive_mutex mutex; }; diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp new file mode 100644 index 000000000..62587e18a --- /dev/null +++ b/src/video_core/buffer_cache/map_interval.cpp @@ -0,0 +1,33 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <cstddef> +#include <memory> + +#include "video_core/buffer_cache/map_interval.h" + +namespace VideoCommon { + +MapIntervalAllocator::MapIntervalAllocator() { + FillFreeList(first_chunk); +} + +MapIntervalAllocator::~MapIntervalAllocator() = default; + +void MapIntervalAllocator::AllocateNewChunk() { + *new_chunk = std::make_unique<Chunk>(); + FillFreeList(**new_chunk); + new_chunk = &(*new_chunk)->next; +} + +void MapIntervalAllocator::FillFreeList(Chunk& chunk) { + const std::size_t old_size = free_list.size(); + free_list.resize(old_size + chunk.data.size()); + std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, + [](MapInterval& interval) { return &interval; }); +} + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h index b0956029d..fe0bcd1d8 100644 --- a/src/video_core/buffer_cache/map_interval.h +++ b/src/video_core/buffer_cache/map_interval.h @@ -4,86 +4,89 @@ #pragma once +#include <array> +#include <cstddef> +#include <memory> +#include <vector> + +#include <boost/intrusive/set_hook.hpp> + #include "common/common_types.h" #include "video_core/gpu.h" namespace VideoCommon { -class MapIntervalBase { -public: - MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) - : start{start}, end{end}, gpu_addr{gpu_addr} {} +struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> { + MapInterval() = default; - void SetCpuAddress(VAddr new_cpu_addr) { - cpu_addr = new_cpu_addr; - } + /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} - VAddr GetCpuAddress() const { - return cpu_addr; - } + explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept + : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} - GPUVAddr GetGpuAddress() const { - return gpu_addr; + bool IsInside(VAddr other_start, VAddr other_end) const noexcept { + return start <= other_start && other_end <= end; } - bool IsInside(const VAddr other_start, const VAddr other_end) const { - return (start <= other_start && other_end <= end); + bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { + return start < other_end && other_start < end; } - bool operator==(const MapIntervalBase& rhs) const { - return std::tie(start, end) == std::tie(rhs.start, rhs.end); - } - - bool operator!=(const MapIntervalBase& rhs) const { - return !operator==(rhs); - } - - void MarkAsRegistered(const bool registered) { - is_registered = registered; + void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { + is_modified = is_modified_; + ticks = ticks_; } - bool IsRegistered() const { - return is_registered; - } + boost::intrusive::set_member_hook<> member_hook_; + VAddr start = 0; + VAddr end = 0; + GPUVAddr gpu_addr = 0; + u64 ticks = 0; + bool is_written = false; + bool is_modified = false; + bool is_registered = false; + bool is_memory_marked = false; + bool is_sync_pending = false; +}; - VAddr GetStart() const { - return start; +struct MapIntervalCompare { + constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { + return lhs.start < rhs.start; } +}; - VAddr GetEnd() const { - return end; +class MapIntervalAllocator { +public: + MapIntervalAllocator(); + ~MapIntervalAllocator(); + + MapInterval* Allocate() { + if (free_list.empty()) { + AllocateNewChunk(); + } + MapInterval* const interval = free_list.back(); + free_list.pop_back(); + return 
interval; } - void MarkAsModified(const bool is_modified_, const u64 tick) { - is_modified = is_modified_; - ticks = tick; + void Release(MapInterval* interval) { + free_list.push_back(interval); } - bool IsModified() const { - return is_modified; - } +private: + struct Chunk { + std::unique_ptr<Chunk> next; + std::array<MapInterval, 0x8000> data; + }; - u64 GetModificationTick() const { - return ticks; - } + void AllocateNewChunk(); - void MarkAsWritten(const bool is_written_) { - is_written = is_written_; - } + void FillFreeList(Chunk& chunk); - bool IsWritten() const { - return is_written; - } + std::vector<MapInterval*> free_list; + std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; -private: - VAddr start; - VAddr end; - GPUVAddr gpu_addr; - VAddr cpu_addr{}; - bool is_written{}; - bool is_modified{}; - bool is_registered{}; - u64 ticks{}; + Chunk first_chunk; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..b60f86260 --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,171 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
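MapIntervalAllocator replaces the shared_ptr-per-map scheme: intervals live in 0x8000-element chunks chained through a unique_ptr, and a free list of raw pointers serves Allocate/Release, so map objects never move and need no per-object heap allocation. A trimmed-down sketch of the same pool pattern, with a smaller chunk size and a hypothetical Node type:

// Chunked object pool with a pointer free list, mirroring MapIntervalAllocator; illustrative.
#include <array>
#include <memory>
#include <vector>

struct Node {
    int payload = 0;
};

class PoolAllocator {
public:
    PoolAllocator() {
        FillFreeList(first_chunk);
    }

    Node* Allocate() {
        if (free_list.empty()) {
            AllocateNewChunk();
        }
        Node* const node = free_list.back();
        free_list.pop_back();
        return node;
    }

    void Release(Node* node) {
        free_list.push_back(node); // objects are recycled, never freed individually
    }

private:
    struct Chunk {
        std::unique_ptr<Chunk> next;
        std::array<Node, 1024> data;
    };

    void AllocateNewChunk() {
        *new_chunk = std::make_unique<Chunk>();
        FillFreeList(**new_chunk);
        new_chunk = &(*new_chunk)->next;
    }

    void FillFreeList(Chunk& chunk) {
        for (Node& node : chunk.data) {
            free_list.push_back(&node);
        }
    }

    std::vector<Node*> free_list;
    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
    Chunk first_chunk;
};

int main() {
    PoolAllocator pool;
    Node* const node = pool.Allocate();
    node->payload = 42;
    pool.Release(node);
    return 0;
}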
+// + +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "common/bit_util.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu) + : gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)), + vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), + host1x_processor(std::make_unique<Host1x>(gpu)), + nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)), + vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { + cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { + while (!cdma_queue.empty()) { + Step(); + } +} + +void CDmaPusher::Step() { + const auto entries{cdma_queue.front()}; + cdma_queue.pop(); + + std::vector<u32> values(entries.size()); + std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + + for (const u32 value : values) { + if (mask != 0) { + const u32 lbs = Common::CountTrailingZeroes32(mask); + mask &= ~(1U << lbs); + ExecuteCommand(static_cast<u32>(offset + lbs), value); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(static_cast<u32>(offset), value); + if (incrementing) { + ++offset; + } + continue; + } + const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value & 0x3f; + offset = (value >> 16) & 0xfff; + current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value & 0xffff; + offset = (value >> 16) & 0xfff; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value & 0xffff; + offset = (value >> 16) & 0xfff; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value & 0xfff; + offset = (value >> 16) & 0xfff; + ExecuteCommand(static_cast<u32>(offset), data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); + break; + } + } +} + +void CDmaPusher::ExecuteCommand(u32 offset, u32 data) { + switch (current_class) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, offset, {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + nvdec_sync->Increment(syncpoint_id); + } else { + nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + nvdec_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + static_cast<u32>(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod( + static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data}); + break; + default: + break; + } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = 
static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + vic_sync->Increment(syncpoint_id); + } else { + vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + vic_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast<u32>(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0), + {data}); + break; + default: + break; + } + break; + case ChClassId::Host1x: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data}); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); + break; + } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset; + std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..982f309c5 --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,138 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { + SetClass = 0, + Incrementing = 1, + NonIncrementing = 2, + Mask = 3, + Immediate = 4, + Restart = 5, + Gather = 6, +}; + +enum class ChClassId : u32 { + NoClass = 0x0, + Host1x = 0x1, + VideoEncodeMpeg = 0x20, + VideoEncodeNvEnc = 0x21, + VideoStreamingVi = 0x30, + VideoStreamingIsp = 0x32, + VideoStreamingIspB = 0x34, + VideoStreamingViI2c = 0x36, + GraphicsVic = 0x5d, + Graphics3D = 0x60, + GraphicsGpu = 0x61, + Tsec = 0xe0, + TsecB = 0xe1, + NvJpg = 0xc0, + NvDec = 0xf0 +}; + +enum class ChMethod : u32 { + Empty = 0, + SetMethod = 0x10, + SetData = 0x11, +}; + +union ChCommandHeader { + u32 raw; + BitField<0, 16, u32> value; + BitField<16, 12, ChMethod> method_offset; + BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { + ChClassId class_id{}; + int method_offset{}; + std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>; +using ChCommandList = std::vector<Tegra::ChCommand>; + +struct ThiRegisters { + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { + IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), + SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), + SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: + explicit 
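CDmaPusher::Step drives a small state machine over the channel command stream: bits 28-31 of each word select a submission mode, bits 16-27 a register offset, and the low bits carry an immediate value, a word count (Incrementing/NonIncrementing), a write mask (Mask), or a class id in bits 6-15 (SetClass). A self-contained sketch of that header decoding, written with plain shifts instead of the BitField union used in the change:

// Manual decode of a Host1x channel command word matching ChCommandHeader's layout;
// the printout and DecodeWord helper are illustrative only.
#include <cstdint>
#include <iostream>

enum class SubmissionMode : std::uint32_t {
    SetClass = 0,
    Incrementing = 1,
    NonIncrementing = 2,
    Mask = 3,
    Immediate = 4,
};

void DecodeWord(std::uint32_t value) {
    const auto mode = static_cast<SubmissionMode>((value >> 28) & 0xf);
    const std::uint32_t offset = (value >> 16) & 0xfff;
    switch (mode) {
    case SubmissionMode::SetClass:
        std::cout << "switch to class 0x" << std::hex << ((value >> 6) & 0x3ff)
                  << ", initial mask 0x" << (value & 0x3f) << std::dec << '\n';
        break;
    case SubmissionMode::Incrementing:
    case SubmissionMode::NonIncrementing:
        std::cout << "write " << (value & 0xffff) << " words starting at offset 0x" << std::hex
                  << offset << std::dec << '\n';
        break;
    case SubmissionMode::Mask:
        std::cout << "masked write, mask 0x" << std::hex << (value & 0xffff) << " at offset 0x"
                  << offset << std::dec << '\n';
        break;
    case SubmissionMode::Immediate:
        std::cout << "immediate write of 0x" << std::hex << (value & 0xfff) << " to offset 0x"
                  << offset << std::dec << '\n';
        break;
    default:
        std::cout << "unhandled submission mode\n";
        break;
    }
}

int main() {
    DecodeWord((4u << 28) | (0x700u << 16) | 0x123u); // Immediate: offset 0x700, data 0x123
    return 0;
}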
CDmaPusher(GPU& gpu); + ~CDmaPusher(); + + /// Push NVDEC command buffer entries into queue + void Push(ChCommandHeaderList&& entries); + + /// Process queued command buffer entries + void DispatchCalls(); + + /// Process one queue element + void Step(); + + /// Invoke command class devices to execute the command based on the current state + void ExecuteCommand(u32 offset, u32 data); + +private: + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments); + + GPU& gpu; + + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + std::unique_ptr<Tegra::Vic> vic_processor; + std::unique_ptr<Tegra::Host1x> host1x_processor; + std::unique_ptr<SyncptIncrManager> nvdec_sync; + std::unique_ptr<SyncptIncrManager> vic_sync; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; + + s32 count{}; + s32 offset{}; + s32 mask{}; + bool incrementing{}; + + // Queue of command lists to be processed + std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..1adf3cd13 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <fstream> +#include <vector> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +Codec::Codec(GPU& gpu_) + : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), + vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + avcodec_send_packet(av_codec_ctx, nullptr); + avcodec_receive_frame(av_codec_ctx, av_frame); + avcodec_flush_buffers(av_codec_ctx); + + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); + current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); + std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { + bool is_first_frame = false; + + if (!initialized) { + if (current_codec == NvdecCommon::VideoCodec::H264) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); + } else { + LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec)); + return; + } + + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_frame = av_frame_alloc(); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + + // TODO(ameerj): libavcodec gpu hw acceleration + + const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); + if (av_error < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); + return; + } 
+ initialized = true; + is_first_frame = true; + } + bool vp9_hidden_frame = false; + + AVPacket packet{}; + av_init_packet(&packet); + std::vector<u8> frame_data; + + if (current_codec == NvdecCommon::VideoCodec::H264) { + frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + frame_data = vp9_decoder->ComposeFrameHeader(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + } + + packet.data = frame_data.data(); + packet.size = static_cast<int>(frame_data.size()); + + avcodec_send_packet(av_codec_ctx, &packet); + + if (!vp9_hidden_frame) { + // Only receive/store visible frames + avcodec_receive_frame(av_codec_ctx, av_frame); + } +} + +AVFrame* Codec::GetCurrentFrame() { + return av_frame; +} + +const AVFrame* Codec::GetCurrentFrame() const { + return av_frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 000000000..5bbe6a332 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,66 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu); + ~Codec(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(NvdecCommon::VideoCodec codec); + + /// Populate NvdecRegisters state with argument value at the provided offset + void StateWrite(u32 offset, u64 arguments); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns most recently decoded frame + [[nodiscard]] AVFrame* GetCurrentFrame(); + [[nodiscard]] const AVFrame* GetCurrentFrame() const; + + /// Returns the value of current_codec + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: + bool initialized{}; + NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + + AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + AVFrame* av_frame{nullptr}; + + GPU& gpu; + std::unique_ptr<Decoder::H264> h264_decoder; + std::unique_ptr<Decoder::VP9> vp9_decoder; + + NvdecCommon::NvdecRegisters state{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..33e063e20 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,293 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// 
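Codec::Decode wires the composed bitstream into libavcodec: find the decoder, allocate and open a context on first use, then feed packets with avcodec_send_packet and pull pictures with avcodec_receive_frame. A stripped-down sketch of that flow under the same FFmpeg API the change uses (av_init_packet is the call in the diff; deprecation details vary across FFmpeg versions, and the function name and error handling here are illustrative):

// Minimal libavcodec H.264 decode flow following the shape of Codec::Decode.
// Assumes bitstream holds a valid Annex-B access unit; illustrative only.
#include <cstdint>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
}

bool DecodeOneFrame(const std::vector<std::uint8_t>& bitstream) {
    const AVCodec* const codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    if (!codec) {
        return false;
    }
    AVCodecContext* ctx = avcodec_alloc_context3(codec);
    if (!ctx || avcodec_open2(ctx, codec, nullptr) < 0) {
        return false;
    }
    AVFrame* frame = av_frame_alloc();

    AVPacket packet{};
    av_init_packet(&packet);
    packet.data = const_cast<std::uint8_t*>(bitstream.data());
    packet.size = static_cast<int>(bitstream.size());

    // receive_frame can report EAGAIN until the decoder has enough input,
    // which is why the cache keeps feeding packets frame by frame.
    const bool ok = avcodec_send_packet(ctx, &packet) == 0 &&
                    avcodec_receive_frame(ctx, frame) == 0;

    av_frame_free(&frame);
    avcodec_free_context(&ctx);
    return ok;
}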
furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <array> +#include "common/bit_util.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.frame_data_size); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + } else { + /// Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); + } else if (order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 
1 : 2); + + writer.WriteUe(16); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); + } + + writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); + writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); + s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); + pic_init_qp = (pic_init_qp << 26) >> 26; + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); + chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); + writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + const auto matrix_x4 = + std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); + writer.WriteScalingList(matrix_x4, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), + context.scaling_matrix_8.end()); + + writer.WriteScalingList(matrix_x8, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); + chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.frame_data_size); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), + context.frame_data_size); + } + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 
1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { + std::vector<u8> scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast<s32>(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1)); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..273449495 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// append a bit to the stream, equivalent value to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the sream + void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 header of the frame for FFmpeg decoding + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame = false); + +private: + struct H264ParameterSet { + u32 log2_max_pic_order_cnt{}; + u32 delta_pic_order_always_zero_flag{}; + u32 frame_mbs_only_flag{}; + u32 pic_width_in_mbs{}; + u32 pic_height_in_map_units{}; + INSERT_PADDING_WORDS(1); + u32 entropy_coding_mode_flag{}; + u32 bottom_field_pic_order_flag{}; + u32 num_refidx_l0_default_active{}; + u32 num_refidx_l1_default_active{}; + u32 deblocking_filter_control_flag{}; + u32 redundant_pic_count_flag{}; + u32 transform_8x8_mode_flag{}; + INSERT_PADDING_WORDS(9); + u64 flags{}; + u32 frame_number{}; + u32 frame_number2{}; + }; + static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_BYTES(0x48); + u32 frame_data_size{}; + INSERT_PADDING_BYTES(0xc); + H264ParameterSet h264_parameter_set{}; + INSERT_PADDING_BYTES(0x100); + std::array<u8, 0x60> scaling_matrix_4; + std::array<u8, 0x80> scaling_matrix_8; + }; + static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + + std::vector<u8> frame; + GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..ab44fdc9e --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,1040 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
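+//
+// Composes the VP9 uncompressed and compressed frame headers from NVDEC-provided register
+// state so that the reconstructed bitstream can be handed to the FFmpeg decoder (see
+// Codec::Decode in codec.cpp).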
+ +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 0, 84, 49, 136, 0, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 107, 169, 0, 35, 99, 159, 0, 17, 82, 140, 0, 8, 66, 114, 0, + 2, 44, 76, 0, 1, 19, 32, 0, 40, 132, 201, 0, 29, 114, 187, 0, 13, 91, 157, 0, + 7, 75, 127, 0, 3, 58, 95, 0, 1, 28, 47, 0, 69, 142, 221, 0, 42, 122, 201, 0, + 15, 91, 159, 0, 6, 67, 121, 0, 1, 42, 77, 0, 1, 17, 31, 0, 102, 148, 228, 0, + 67, 117, 204, 0, 17, 82, 154, 0, 6, 59, 114, 0, 2, 39, 75, 0, 1, 15, 29, 0, + 156, 57, 233, 0, 119, 57, 212, 0, 58, 48, 163, 0, 29, 40, 124, 0, 12, 30, 81, 0, + 3, 12, 31, 0, 191, 107, 226, 0, 124, 117, 204, 0, 25, 99, 155, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 29, 148, 210, 0, 37, 126, 194, 0, 8, 93, 157, 0, + 2, 68, 118, 0, 1, 39, 69, 0, 1, 17, 33, 0, 41, 151, 213, 0, 27, 123, 193, 0, + 3, 82, 144, 0, 1, 58, 105, 0, 1, 32, 60, 0, 1, 13, 26, 0, 59, 159, 220, 0, + 23, 126, 198, 0, 4, 88, 151, 0, 1, 66, 114, 0, 1, 38, 71, 0, 1, 18, 34, 0, + 114, 136, 232, 0, 51, 114, 207, 0, 11, 83, 155, 0, 3, 56, 105, 0, 1, 33, 65, 0, + 1, 17, 34, 0, 149, 65, 234, 0, 121, 57, 215, 0, 61, 49, 166, 0, 28, 36, 114, 0, + 12, 25, 76, 0, 3, 16, 42, 0, 214, 49, 220, 0, 132, 63, 188, 0, 42, 65, 137, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 137, 221, 0, 104, 131, 216, 0, + 49, 111, 192, 0, 21, 87, 155, 0, 2, 49, 87, 0, 1, 16, 28, 0, 89, 163, 230, 0, + 90, 137, 220, 0, 29, 100, 183, 0, 10, 70, 135, 0, 2, 42, 81, 0, 1, 17, 33, 0, + 108, 167, 237, 0, 55, 133, 222, 0, 15, 97, 179, 0, 4, 72, 135, 0, 1, 45, 85, 0, + 1, 19, 38, 0, 124, 146, 240, 0, 66, 124, 224, 0, 17, 88, 175, 0, 4, 58, 122, 0, + 1, 36, 75, 0, 1, 18, 37, 0, 141, 79, 241, 0, 126, 70, 227, 0, 66, 58, 182, 0, + 30, 44, 136, 0, 12, 34, 96, 0, 2, 20, 47, 0, 229, 99, 249, 0, 143, 111, 235, 0, + 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82, 158, 236, 0, + 94, 146, 224, 0, 25, 117, 191, 0, 9, 87, 149, 0, 3, 56, 99, 0, 1, 33, 57, 0, + 83, 167, 237, 0, 68, 145, 222, 0, 10, 103, 177, 0, 2, 72, 131, 0, 1, 41, 79, 0, + 1, 20, 39, 0, 99, 167, 239, 0, 47, 141, 224, 0, 10, 104, 178, 0, 2, 73, 133, 0, + 1, 44, 85, 0, 1, 22, 47, 0, 127, 145, 243, 0, 71, 129, 228, 0, 17, 93, 177, 0, + 3, 61, 124, 0, 1, 41, 84, 0, 1, 21, 52, 0, 157, 78, 244, 0, 140, 72, 231, 0, + 69, 58, 184, 0, 31, 44, 137, 0, 14, 38, 105, 0, 8, 23, 61, 0, 125, 34, 187, 0, + 52, 41, 133, 0, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 0, 51, 102, 147, 0, 23, 87, 128, 0, 8, 67, 101, 0, 1, 41, 63, 0, + 1, 19, 29, 0, 31, 154, 185, 0, 17, 127, 175, 0, 6, 96, 145, 0, 2, 73, 114, 0, + 1, 51, 82, 0, 1, 28, 45, 0, 23, 163, 200, 0, 10, 131, 185, 0, 2, 93, 148, 0, + 1, 67, 111, 0, 1, 41, 69, 0, 1, 14, 24, 0, 29, 176, 217, 0, 12, 145, 201, 0, + 3, 101, 156, 0, 1, 69, 111, 0, 1, 39, 63, 0, 
1, 14, 23, 0, 57, 192, 233, 0, + 25, 154, 215, 0, 6, 109, 167, 0, 3, 78, 118, 0, 1, 48, 69, 0, 1, 21, 29, 0, + 202, 105, 245, 0, 108, 106, 216, 0, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 33, 172, 219, 0, 64, 149, 206, 0, 14, 117, 177, 0, 5, 90, 141, 0, + 2, 61, 95, 0, 1, 37, 57, 0, 33, 179, 220, 0, 11, 140, 198, 0, 1, 89, 148, 0, + 1, 60, 104, 0, 1, 33, 57, 0, 1, 12, 21, 0, 30, 181, 221, 0, 8, 141, 198, 0, + 1, 87, 145, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 20, 0, 32, 186, 224, 0, + 7, 142, 198, 0, 1, 86, 143, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 22, 0, + 57, 192, 227, 0, 20, 143, 204, 0, 3, 96, 154, 0, 1, 68, 112, 0, 1, 42, 69, 0, + 1, 19, 32, 0, 212, 35, 215, 0, 113, 47, 169, 0, 29, 48, 105, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 74, 129, 203, 0, 106, 120, 203, 0, 49, 107, 178, 0, + 19, 84, 144, 0, 4, 50, 84, 0, 1, 15, 25, 0, 71, 172, 217, 0, 44, 141, 209, 0, + 15, 102, 173, 0, 6, 76, 133, 0, 2, 51, 89, 0, 1, 24, 42, 0, 64, 185, 231, 0, + 31, 148, 216, 0, 8, 103, 175, 0, 3, 74, 131, 0, 1, 46, 81, 0, 1, 18, 30, 0, + 65, 196, 235, 0, 25, 157, 221, 0, 5, 105, 174, 0, 1, 67, 120, 0, 1, 38, 69, 0, + 1, 15, 30, 0, 65, 204, 238, 0, 30, 156, 224, 0, 7, 107, 177, 0, 2, 70, 124, 0, + 1, 42, 73, 0, 1, 18, 34, 0, 225, 86, 251, 0, 144, 104, 235, 0, 42, 99, 181, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 175, 239, 0, 112, 165, 229, 0, + 29, 136, 200, 0, 12, 103, 162, 0, 6, 77, 123, 0, 2, 53, 84, 0, 75, 183, 239, 0, + 30, 155, 221, 0, 3, 106, 171, 0, 1, 74, 128, 0, 1, 44, 76, 0, 1, 17, 28, 0, + 73, 185, 240, 0, 27, 159, 222, 0, 2, 107, 172, 0, 1, 75, 127, 0, 1, 42, 73, 0, + 1, 17, 29, 0, 62, 190, 238, 0, 21, 159, 222, 0, 2, 107, 172, 0, 1, 72, 122, 0, + 1, 40, 71, 0, 1, 18, 32, 0, 61, 199, 240, 0, 27, 161, 226, 0, 4, 113, 180, 0, + 1, 76, 129, 0, 1, 46, 80, 0, 1, 23, 41, 0, 7, 27, 153, 0, 5, 30, 95, 0, + 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 75, 127, 0, + 57, 75, 124, 0, 27, 67, 108, 0, 10, 54, 86, 0, 1, 33, 52, 0, 1, 12, 18, 0, + 43, 125, 151, 0, 26, 108, 148, 0, 7, 83, 122, 0, 2, 59, 89, 0, 1, 38, 60, 0, + 1, 17, 27, 0, 23, 144, 163, 0, 13, 112, 154, 0, 2, 75, 117, 0, 1, 50, 81, 0, + 1, 31, 51, 0, 1, 14, 23, 0, 18, 162, 185, 0, 6, 123, 171, 0, 1, 78, 125, 0, + 1, 51, 86, 0, 1, 31, 54, 0, 1, 14, 23, 0, 15, 199, 227, 0, 3, 150, 204, 0, + 1, 91, 146, 0, 1, 55, 95, 0, 1, 30, 53, 0, 1, 11, 20, 0, 19, 55, 240, 0, + 19, 59, 196, 0, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 0, 104, 153, 199, 0, 31, 123, 181, 0, 14, 101, 152, 0, 5, 72, 106, 0, + 1, 36, 52, 0, 35, 176, 211, 0, 12, 131, 190, 0, 2, 88, 144, 0, 1, 60, 101, 0, + 1, 36, 60, 0, 1, 16, 28, 0, 28, 183, 213, 0, 8, 134, 191, 0, 1, 86, 142, 0, + 1, 56, 96, 0, 1, 30, 53, 0, 1, 12, 20, 0, 20, 190, 215, 0, 4, 135, 192, 0, + 1, 84, 139, 0, 1, 53, 91, 0, 1, 28, 49, 0, 1, 11, 20, 0, 13, 196, 216, 0, + 2, 137, 192, 0, 1, 86, 143, 0, 1, 57, 99, 0, 1, 32, 56, 0, 1, 13, 24, 0, + 211, 29, 217, 0, 96, 47, 156, 0, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 78, 120, 193, 0, 111, 116, 186, 0, 46, 102, 164, 0, 15, 80, 128, 0, + 2, 49, 76, 0, 1, 18, 28, 0, 71, 161, 203, 0, 42, 132, 192, 0, 10, 98, 150, 0, + 3, 69, 109, 0, 1, 44, 70, 0, 1, 18, 29, 0, 57, 186, 211, 0, 30, 140, 196, 0, + 4, 93, 146, 0, 1, 62, 102, 0, 1, 38, 65, 0, 1, 16, 27, 0, 47, 199, 217, 0, + 14, 145, 196, 0, 1, 88, 142, 0, 1, 57, 98, 0, 1, 36, 62, 0, 1, 15, 26, 0, + 26, 219, 229, 0, 5, 155, 207, 0, 1, 94, 151, 0, 1, 60, 104, 0, 1, 36, 62, 0, + 1, 16, 28, 0, 233, 29, 248, 0, 146, 47, 220, 0, 43, 52, 140, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 100, 163, 232, 0, 179, 161, 222, 0, 63, 142, 204, 0, + 37, 113, 174, 0, 26, 89, 137, 0, 18, 68, 97, 0, 85, 181, 230, 0, 32, 146, 209, 0, + 7, 100, 164, 0, 3, 71, 121, 0, 1, 45, 77, 0, 1, 18, 30, 0, 65, 187, 230, 0, + 20, 148, 207, 0, 2, 97, 159, 0, 1, 68, 116, 0, 1, 40, 70, 0, 1, 14, 29, 0, + 40, 194, 227, 0, 8, 147, 204, 0, 1, 94, 155, 0, 1, 65, 112, 0, 1, 39, 66, 0, + 1, 14, 26, 0, 16, 208, 228, 0, 3, 151, 207, 0, 1, 98, 160, 0, 1, 67, 117, 0, + 1, 41, 74, 0, 1, 17, 31, 0, 17, 38, 140, 0, 7, 34, 80, 0, 1, 17, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 75, 128, 0, 41, 76, 128, 0, + 26, 66, 116, 0, 12, 52, 94, 0, 2, 32, 55, 0, 1, 10, 16, 0, 50, 127, 154, 0, + 37, 109, 152, 0, 16, 82, 121, 0, 5, 59, 85, 0, 1, 35, 54, 0, 1, 13, 20, 0, + 40, 142, 167, 0, 17, 110, 157, 0, 2, 71, 112, 0, 1, 44, 72, 0, 1, 27, 45, 0, + 1, 11, 17, 0, 30, 175, 188, 0, 9, 124, 169, 0, 1, 74, 116, 0, 1, 48, 78, 0, + 1, 30, 49, 0, 1, 11, 18, 0, 10, 222, 223, 0, 2, 150, 194, 0, 1, 83, 128, 0, + 1, 48, 79, 0, 1, 27, 45, 0, 1, 11, 17, 0, 36, 41, 235, 0, 29, 36, 193, 0, + 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 165, 222, 0, + 177, 162, 215, 0, 110, 135, 195, 0, 57, 113, 168, 0, 23, 83, 120, 0, 10, 49, 61, 0, + 85, 190, 223, 0, 36, 139, 200, 0, 5, 90, 146, 0, 1, 60, 103, 0, 1, 38, 65, 0, + 1, 18, 30, 0, 72, 202, 223, 0, 23, 141, 199, 0, 2, 86, 140, 0, 1, 56, 97, 0, + 1, 36, 61, 0, 1, 16, 27, 0, 55, 218, 225, 0, 13, 145, 200, 0, 1, 86, 141, 0, + 1, 57, 99, 0, 1, 35, 61, 0, 1, 13, 22, 0, 15, 235, 212, 0, 1, 132, 184, 0, + 1, 84, 139, 0, 1, 57, 97, 0, 1, 34, 56, 0, 1, 14, 23, 0, 181, 21, 201, 0, + 61, 37, 123, 0, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 0, 95, 104, 173, 0, 42, 93, 159, 0, 18, 77, 131, 0, 4, 50, 81, 0, + 1, 17, 23, 0, 62, 147, 199, 0, 44, 130, 189, 0, 28, 102, 154, 0, 18, 75, 115, 0, + 2, 44, 65, 0, 1, 12, 19, 0, 55, 153, 210, 0, 24, 130, 194, 0, 3, 93, 146, 0, + 1, 61, 97, 0, 1, 31, 50, 0, 1, 10, 16, 0, 49, 186, 223, 0, 17, 148, 204, 0, + 1, 96, 142, 0, 1, 53, 83, 0, 1, 26, 44, 0, 1, 11, 17, 0, 13, 217, 212, 0, + 2, 136, 180, 0, 1, 78, 124, 0, 1, 50, 83, 0, 1, 29, 49, 0, 1, 14, 23, 0, + 197, 13, 247, 0, 82, 17, 222, 0, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 126, 186, 247, 0, 234, 191, 243, 0, 176, 177, 234, 0, 104, 158, 220, 0, + 66, 128, 186, 0, 55, 90, 137, 0, 111, 197, 242, 0, 46, 158, 219, 0, 9, 104, 171, 0, + 2, 65, 125, 0, 1, 44, 80, 0, 1, 17, 91, 0, 104, 208, 245, 0, 39, 168, 224, 0, + 3, 109, 162, 0, 1, 79, 124, 0, 1, 50, 102, 0, 1, 43, 102, 0, 84, 220, 246, 0, + 31, 177, 231, 0, 2, 115, 180, 0, 1, 79, 134, 0, 1, 55, 77, 0, 1, 60, 79, 0, + 43, 243, 240, 0, 8, 180, 217, 0, 1, 115, 166, 0, 1, 84, 121, 0, 1, 51, 67, 0, + 1, 16, 6, 0, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 
234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. 
Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu) : gpu(gpu) {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + writer.Write(value, size - 1); + } else { + writer.Write(delta / 2 + mask, size - 1); + writer.Write(delta & 1, 1); + } + } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { + const bool is_lt = value < test; + writer.Write(!is_lt); + return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 2304>& new_prob, + const std::array<u8, 2304>& old_prob) { + // Note: There's 1 byte added on each packet for alignment, + // this byte is ignored when doing updates. 
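+    // Layout: coef_probs holds four tx-size blocks of 2 (plane types) * 2 (inter/intra) *
+    // 6 (coefficient bands) * 6 (contexts) entries, each entry being 3 probabilities plus the
+    // alignment byte, i.e. 2 * 2 * 6 * 6 * 4 = 576 bytes per block and 2304 bytes in total.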
+ constexpr s32 block_bytes = 2 * 2 * 6 * 6 * 4; + + const auto needs_update = [&](s32 base_index) -> bool { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (new_prob[index + 0] != old_prob[index + 0] || + new_prob[index + 1] != old_prob[index + 1] || + new_prob[index + 2] != old_prob[index + 2]) { + return true; + } + + index += 4; + } + } + } + } + return false; + }; + + for (s32 block_index = 0; block_index < 4; block_index++) { + const s32 base_index = block_index * block_bytes; + const bool update = needs_update(base_index); + writer.Write(update); + + if (update) { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (k != 0 || l < 3) { + WriteProbabilityUpdate(writer, new_prob[index + 0], + old_prob[index + 0]); + WriteProbabilityUpdate(writer, new_prob[index + 1], + old_prob[index + 1]); + WriteProbabilityUpdate(writer, new_prob[index + 2], + old_prob[index + 2]); + } + index += 4; + } + } + } + } + } + + if (block_index == tx_mode) { + break; + } + } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + writer.Write(update, diff_update_probability); + + if (update) { + writer.Write(new_prob >> 1, 7); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); + + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + + // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following + // order: last, golden, altref, current. It may be worthwhile to track the updates done here + // to avoid buffering frame data needed for reference frame updating in the header composition. 
+ std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy{}; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer frame{}; + { + gpu.SyncGuestHost(); + frame.info = GetVp9PictureInfo(state); + + frame.bit_stream.resize(frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(), + frame.info.bitstream_size); + } + // Buffer two frames, saving the last show frame info + if (!next_next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_next_frame.info; + frame.bit_stream = next_next_frame.bit_stream; + next_next_frame = std::move(temp); + + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_frame.info; + frame.bit_stream = next_frame.bit_stream; + next_frame = std::move(temp2); + } else { + next_frame.info = frame.info; + next_frame.bit_stream = frame.bit_stream; + } + } else { + next_next_frame.info = frame.info; + next_next_frame.bit_stream = frame.bit_stream; + } + return frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + + if (!current_frame_info.lossless) { + if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + prev_frame_probs.inter_mode_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + } + + if (current_frame_info.interp_filter 
== 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + } + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + } + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + } + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); 
+ + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], + prev_frame_probs.prob_bits[index]); + } + } + + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 3; k++) { + const int index = i * 2 * 3 + j * 3 + k; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], + prev_frame_probs.class_0_fr[index]); + } + } + + for (s32 j = 0; j < 3; j++) { + const int index = i * 3 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], + prev_frame_probs.fr[index]); + } + } + + if (current_frame_info.allow_high_precision_mv) { + for (s32 index = 0; index < 2; index++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], + prev_frame_probs.class_0_hp[index]); + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], + prev_frame_probs.high_precision[index]); + } + } + + // save previous probs + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.sign = current_frame_info.entropy.sign; + prev_frame_probs.classes = current_frame_info.entropy.classes; + prev_frame_probs.class_0 = current_frame_info.entropy.class_0; + prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; + prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; + prev_frame_probs.fr = current_frame_info.entropy.fr; + prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; + prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; + } + } + + writer.End(); + return writer.GetBuffer(); +} + +VpxBitStreamWriter VP9::ComposeUncompressedHeader() { + VpxBitStreamWriter uncomp_writer{}; + + uncomp_writer.WriteU(2, 2); // Frame marker. + uncomp_writer.WriteU(0, 2); // Profile. + uncomp_writer.WriteBit(false); // Show existing frame. + uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame? + uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame? + uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience + + if (current_frame_info.is_key_frame) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(0, 3); // Color space. + uncomp_writer.WriteU(0, 1); // Color range. + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + + // Reset context + prev_frame_probs = default_probs; + swap_next_golden = false; + loop_filter_ref_deltas.fill(0); + loop_filter_mode_deltas.fill(0); + + // allow frames offsets to stabilize before checking for golden frames + grace_period = 4; + + // On key frames, all frame slots are set to the current frame, + // so the value of the selected slot doesn't really matter. + frame_ctxs.fill({current_frame_number, false, default_probs}); + + // intra only, meaning the frame can be recreated with no other references + current_frame_info.intra_only = true; + + } else { + + if (!current_frame_info.show_frame) { + uncomp_writer.WriteBit(current_frame_info.intra_only); + if (!current_frame_info.last_frame_was_key) { + swap_next_golden = !swap_next_golden; + } + } else { + current_frame_info.intra_only = false; + } + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteU(0, 2); // Reset frame context. 
+ } + + // Last, Golden, Altref frames + std::array<s32, 3> ref_frame_index{0, 1, 2}; + + // Set when next frame is hidden + // altref and golden references are swapped + if (swap_next_golden) { + ref_frame_index = std::array<s32, 3>{0, 2, 1}; + } + + // update Last Frame + u64 refresh_frame_flags = 1; + + // golden frame may refresh, determined if the next golden frame offset is changed + bool golden_refresh = false; + if (grace_period <= 0) { + for (s32 index = 1; index < 3; ++index) { + if (current_frame_info.frame_offsets[index] != + next_frame.info.frame_offsets[index]) { + current_frame_info.refresh_frame[index] = true; + golden_refresh = true; + grace_period = 3; + } + } + } + + if (current_frame_info.show_frame && + (!next_frame.info.show_frame || next_frame.info.is_key_frame)) { + // Update golden frame + refresh_frame_flags = swap_next_golden ? 2 : 4; + } + + if (!current_frame_info.show_frame) { + // Update altref + refresh_frame_flags = swap_next_golden ? 2 : 4; + } else if (golden_refresh) { + refresh_frame_flags = 3; + } + + if (current_frame_info.intra_only) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + } else { + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + + for (s32 index = 1; index < 4; index++) { + uncomp_writer.WriteU(ref_frame_index[index - 1], 3); + uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); + } + + uncomp_writer.WriteBit(true); // Frame size with refs. + uncomp_writer.WriteBit(false); // Render and frame size different. + uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); + uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); + + if (current_frame_info.interp_filter != 4) { + uncomp_writer.WriteU(current_frame_info.interp_filter, 2); + } + } + } + + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? + uncomp_writer.WriteBit(true); // Frame parallel decoding mode. + } + + int frame_ctx_idx = 0; + if (!current_frame_info.show_frame) { + frame_ctx_idx = 1; + } + + uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. 
+ prev_frame_probs = + frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array<bool, 4> update_loop_filter_ref_deltas; + std::array<bool, 2> update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. 
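+    // For example, with min_tile_cols_log2 = 0 and log2_tile_cols = 2, tile_cols_log2_diff is 2
+    // and tile_cols_log2_inc_mask is 0b11, so the branch below emits the bits 1 1 0: two
+    // increment bits followed by the extra 0 stop bit.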
+ if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +const std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { + std::vector<u8> bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector<u8> compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), + compressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), + bitstream.data(), bitstream.size()); + + // keep track of frame number + current_frame_number++; + grace_period--; + + // don't display hidden frames + hidden = !current_frame_info.show_frame; + return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = norm_lut[local_range]; + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + 
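+// The fixed-width writers below emit the low value_size bits of value, most significant bit
+// first; WriteS appends the sign as a trailing bit, so e.g. WriteS(-3, 6) writes the 6-bit
+// magnitude 000011 followed by a 1 sign bit (7 bits in total).
+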
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { + WriteBits(value, value_size); +} + +void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { + const bool sign = value < 0; + if (sign) { + value = -value; + } + + WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1); +} + +void VpxBitStreamWriter::WriteDeltaQ(u32 value) { + const bool delta_coded = value != 0; + WriteBit(delta_coded); + + if (delta_coded) { + WriteBits(value, 4); + } +} + +void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { + s32 value_pos = 0; + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free = GetFreeBufferBits(); + + if (copy_size > free) { + copy_size = free; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void VpxBitStreamWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +s32 VpxBitStreamWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void VpxBitStreamWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + buffer = 0; + buffer_pos = 0; +} + +std::vector<u8>& VpxBitStreamWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const { + return byte_array; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h new file mode 100644 index 000000000..e2504512c --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.h @@ -0,0 +1,196 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <vector> + +#include "common/common_types.h" +#include "common/stream.h" +#include "video_core/command_classes/codecs/vp9_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +enum class FrameType { KeyFrame = 0, InterFrame = 1 }; +namespace Decoder { + +/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the +/// VP9 header bitstreams. 
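+/// Typical flow (see VP9::ComposeFrameHeader): the uncompressed header is written with
+/// VpxBitStreamWriter, the compressed header with VpxRangeEncoder, and both are prepended
+/// to the raw frame bitstream before decoding.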
+ +class VpxRangeEncoder { +public: + VpxRangeEncoder(); + ~VpxRangeEncoder(); + + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + + /// Writes the rightmost value_size bits from value into the stream + void Write(s32 value, s32 value_size); + + /// Writes a single bit with half probability + void Write(bool bit); + + /// Writes a bit to the base_stream encoded with probability + void Write(bool bit, s32 probability); + + /// Signal the end of the bitstream + void End(); + + [[nodiscard]] std::vector<u8>& GetBuffer() { + return base_stream.GetBuffer(); + } + + [[nodiscard]] const std::vector<u8>& GetBuffer() const { + return base_stream.GetBuffer(); + } + +private: + u8 PeekByte(); + Common::Stream base_stream{}; + u32 low_value{}; + u32 range{0xff}; + s32 count{-24}; + s32 half_probability{128}; +}; + +class VpxBitStreamWriter { +public: + VpxBitStreamWriter(); + ~VpxBitStreamWriter(); + + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + + /// Write an unsigned integer value + void WriteU(u32 value, u32 value_size); + + /// Write a signed integer value + void WriteS(s32 value, u32 value_size); + + /// Based on 6.2.10 of VP9 Spec, writes a delta coded value + void WriteDeltaQ(u32 value); + + /// Write a single bit. + void WriteBit(bool state); + + /// Pushes current buffer into buffer_array, resets buffer + void Flush(); + + /// Returns byte_array + [[nodiscard]] std::vector<u8>& GetByteArray(); + + /// Returns const byte_array + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + /// Write bit_count bits from value into buffer + void WriteBits(u32 value, u32 bit_count); + + /// Gets next available position in buffer, invokes Flush() if buffer is full + s32 GetFreeBufferBits(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class VP9 { +public: + explicit VP9(GPU& gpu); + ~VP9(); + + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + + /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec + /// documentation + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state); + + /// Returns true if the most recent frame was a hidden frame. 
+ [[nodiscard]] bool WasFrameHidden() const { + return hidden; + } + +private: + /// Generates compressed header probability updates in the bitstream writer + template <typename T, std::size_t N> + void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Generates compressed header probability updates in the bitstream writer + /// If probs are not equal, WriteProbabilityDelta is invoked + void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Generates compressed header probability deltas in the bitstream writer + void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Inverse of 6.3.4 Decode term subexp + void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); + + /// Writes if the value is less than the test value + bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); + + /// Writes probability updates for the Coef probabilities + void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 2304>& new_prob, + const std::array<u8, 2304>& old_prob); + + /// Write probabilities for 4-byte aligned structures + template <typename T, std::size_t N> + void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Write motion vector probability updates. 6.3.17 in the spec + void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Returns VP9 information from NVDEC provided offset and size + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); + + /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct + void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); + + /// Returns frame to be decoded after buffering + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); + + /// Use NVDEC providied information to compose the headers for the current frame + [[nodiscard]] std::vector<u8> ComposeCompressedHeader(); + [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); + + GPU& gpu; + std::vector<u8> frame; + + std::array<s8, 4> loop_filter_ref_deltas{}; + std::array<s8, 2> loop_filter_mode_deltas{}; + + bool hidden = false; + s64 current_frame_number = -2; // since we buffer 2 frames + s32 grace_period = 6; // frame offsets need to stabilize + std::array<FrameContexts, 4> frame_ctxs{}; + Vp9FrameContainer next_frame{}; + Vp9FrameContainer next_next_frame{}; + bool swap_next_golden{}; + + Vp9PictureInfo current_frame_info{}; + Vp9EntropyProbs prev_frame_probs{}; + + s32 diff_update_probability = 252; + s32 frame_sync_code = 0x498342; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h new file mode 100644 index 000000000..4f0b05d22 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -0,0 +1,366 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
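+//
+// Declares the raw NVDEC-facing layouts (PictureInfo, EntropyProbs) together with the unpacked
+// structures they convert into (Vp9PictureInfo, Vp9EntropyProbs) for VP9 header composition.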
+ +#pragma once + +#include <array> +#include <cstring> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { + s16 width{}; + s16 height{}; + s16 luma_pitch{}; + s16 chroma_pitch{}; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum FrameFlags : u32 { + IsKeyFrame = 1 << 0, + LastFrameIsKeyFrame = 1 << 1, + FrameSizeChanged = 1 << 2, + ErrorResilientMode = 1 << 3, + LastShowFrame = 1 << 4, + IntraOnly = 1 << 5, +}; + +enum class MvJointType { + MvJointZero = 0, /* Zero vector */ + MvJointHnzvz = 1, /* Vert zero, hor nonzero */ + MvJointHzvnz = 2, /* Hor zero, vert nonzero */ + MvJointHnzvnz = 3, /* Both components nonzero */ +}; +enum class MvClassType { + MvClass0 = 0, /* (0, 2] integer pel */ + MvClass1 = 1, /* (2, 4] integer pel */ + MvClass2 = 2, /* (4, 8] integer pel */ + MvClass3 = 3, /* (8, 16] integer pel */ + MvClass4 = 4, /* (16, 32] integer pel */ + MvClass5 = 5, /* (32, 64] integer pel */ + MvClass6 = 6, /* (64, 128] integer pel */ + MvClass7 = 7, /* (128, 256] integer pel */ + MvClass8 = 8, /* (256, 512] integer pel */ + MvClass9 = 9, /* (512, 1024] integer pel */ + MvClass10 = 10, /* (1024,2048] integer pel */ +}; + +enum class BlockSize { + Block4x4 = 0, + Block4x8 = 1, + Block8x4 = 2, + Block8x8 = 3, + Block8x16 = 4, + Block16x8 = 5, + Block16x16 = 6, + Block16x32 = 7, + Block32x16 = 8, + Block32x32 = 9, + Block32x64 = 10, + Block64x32 = 11, + Block64x64 = 12, + BlockSizes = 13, + BlockInvalid = BlockSizes +}; + +enum class PredictionMode { + DcPred = 0, // Average of above and left pixels + VPred = 1, // Vertical + HPred = 2, // Horizontal + D45Pred = 3, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi) + D135Pred = 4, // Directional 135 deg = 180 - 45 + D117Pred = 5, // Directional 117 deg = 180 - 63 + D153Pred = 6, // Directional 153 deg = 180 - 27 + D207Pred = 7, // Directional 207 deg = 180 + 27 + D63Pred = 8, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi) + TmPred = 9, // True-motion + NearestMv = 10, + NearMv = 11, + ZeroMv = 12, + NewMv = 13, + MbModeCount = 14 +}; + +enum class TxSize { + Tx4x4 = 0, // 4x4 transform + Tx8x8 = 1, // 8x8 transform + Tx16x16 = 2, // 16x16 transform + Tx32x32 = 3, // 32x32 transform + TxSizes = 4 +}; + +enum class TxMode { + Only4X4 = 0, // Only 4x4 transform used + Allow8X8 = 1, // Allow block transform size up to 8x8 + Allow16X16 = 2, // Allow block transform size up to 16x16 + Allow32X32 = 3, // Allow block transform size up to 32x32 + TxModeSelect = 4, // Transform specified for each block + TxModes = 5 +}; + +enum class reference_mode { + SingleReference = 0, + CompoundReference = 1, + ReferenceModeSelect = 2, + ReferenceModes = 3 +}; + +struct Segmentation { + u8 enabled{}; + u8 update_map{}; + u8 temporal_update{}; + u8 abs_delta{}; + std::array<u32, 8> feature_mask{}; + std::array<std::array<s16, 4>, 8> feature_data{}; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { + u8 mode_ref_delta_enabled{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { + std::array<u8, 36> y_mode_prob{}; + std::array<u8, 64> partition_prob{}; + std::array<u8, 2304> coef_probs{}; + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 
4> intra_inter_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 3> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 20> classes{}; + std::array<u8, 2> class_0{}; + std::array<u8, 20> prob_bits{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + bool is_key_frame{}; + bool intra_only{}; + bool last_frame_was_key{}; + bool frame_size_changed{}; + bool error_resilient_mode{}; + bool last_frame_shown{}; + bool show_frame{}; + std::array<s8, 4> ref_frame_sign_bias{}; + s32 base_q_index{}; + s32 y_dc_delta_q{}; + s32 uv_dc_delta_q{}; + s32 uv_ac_delta_q{}; + bool lossless{}; + s32 transform_mode{}; + bool allow_high_precision_mv{}; + s32 interp_filter{}; + s32 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + s32 log2_tile_cols{}; + s32 log2_tile_rows{}; + bool segment_enabled{}; + bool segment_map_update{}; + bool segment_map_temporal_update{}; + s32 segment_abs_delta{}; + std::array<u32, 8> segment_feature_enable{}; + std::array<std::array<s16, 4>, 8> segment_feature_data{}; + bool mode_ref_delta_enabled{}; + bool use_prev_in_find_mv_refs{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; + Vp9EntropyProbs entropy{}; + Vp9FrameDimensions frame_size{}; + u8 first_level{}; + u8 sharpness_level{}; + u32 bitstream_size{}; + std::array<u64, 4> frame_offsets{}; + std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector<u8> bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS(12); + u32 bitstream_size{}; + INSERT_PADDING_WORDS(5); + Vp9FrameDimensions last_frame_size{}; + Vp9FrameDimensions golden_frame_size{}; + Vp9FrameDimensions alt_frame_size{}; + Vp9FrameDimensions current_frame_size{}; + u32 vp9_flags{}; + std::array<s8, 4> ref_frame_sign_bias{}; + u8 first_level{}; + u8 sharpness_level{}; + u8 base_q_index{}; + u8 y_dc_delta_q{}; + u8 uv_ac_delta_q{}; + u8 uv_dc_delta_q{}; + u8 lossless{}; + u8 tx_mode{}; + u8 allow_high_precision_mv{}; + u8 interp_filter{}; + u8 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + u8 log2_tile_cols{}; + u8 log2_tile_rows{}; + Segmentation segmentation{}; + LoopFilter loop_filter{}; + INSERT_PADDING_BYTES(5); + u32 surface_params{}; + INSERT_PADDING_WORDS(3); + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, + .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, + .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, + .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, + .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, + .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .lossless = lossless != 0, + .transform_mode = tx_mode, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + 
.comp_fixed_ref = comp_fixed_ref, + .comp_var_ref = comp_var_ref, + .log2_tile_cols = log2_tile_cols, + .log2_tile_rows = log2_tile_rows, + .segment_enabled = segmentation.enabled != 0, + .segment_map_update = segmentation.update_map != 0, + .segment_map_temporal_update = segmentation.temporal_update != 0, + .segment_abs_delta = segmentation.abs_delta, + .segment_feature_enable = segmentation.feature_mask, + .segment_feature_data = segmentation.feature_data, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, + .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && + !(vp9_flags == (FrameFlags::FrameSizeChanged)) && + !(vp9_flags == (FrameFlags::IntraOnly)) && + (vp9_flags == (FrameFlags::LastShowFrame)) && + !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), + .ref_deltas = loop_filter.ref_deltas, + .mode_deltas = loop_filter.mode_deltas, + .frame_size = current_frame_size, + .first_level = first_level, + .sharpness_level = sharpness_level, + .bitstream_size = bitstream_size, + }; + } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { + INSERT_PADDING_BYTES(1024); + std::array<std::array<u8, 4>, 7> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + INSERT_PADDING_BYTES(80); + std::array<std::array<u8, 1>, 2> tx_8x8_prob{}; + std::array<std::array<u8, 2>, 2> tx_16x16_prob{}; + std::array<std::array<u8, 3>, 2> tx_32x32_prob{}; + std::array<u8, 4> y_mode_prob_e8{}; + std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{}; + INSERT_PADDING_BYTES(64); + std::array<std::array<u8, 4>, 16> partition_prob{}; + INSERT_PADDING_BYTES(10); + std::array<std::array<u8, 2>, 4> switchable_interp_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 4> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<std::array<u8, 1>, 2> class_0{}; + std::array<std::array<u8, 3>, 2> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; + std::array<std::array<u8, 10>, 2> classes{}; + std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{}; + std::array<std::array<u8, 10>, 2> pred_bits{}; + std::array<std::array<u8, 2>, 5> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + INSERT_PADDING_BYTES(17); + std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4> + coef_probs{}; + + void Convert(Vp9EntropyProbs& fc) { + std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size()); + + std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(), + fc.intra_inter_prob.size()); + + std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size()); + std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size()); + std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size()); + + for (s32 i = 0; i < 4; i++) { + for (s32 j = 0; j < 9; j++) { + fc.y_mode_prob[j + 9 * i] = j < 8 ? 
y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; + } + } + + std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size()); + + std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(), + fc.switchable_interp_prob.size()); + std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size()); + std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size()); + + std::memcpy(fc.joints.data(), joints.data(), fc.joints.size()); + + std::memcpy(fc.sign.data(), sign.data(), fc.sign.size()); + std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size()); + std::memcpy(fc.fr.data(), fr.data(), fc.fr.size()); + std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size()); + std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size()); + std::memcpy(fc.classes.data(), classes.data(), fc.classes.size()); + std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size()); + std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size()); + std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size()); + std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size()); + + std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size()); + } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { + s64 frame{}; + Ref ref{}; + bool refresh{}; +}; + +struct FrameContexts { + s64 from{}; + bool adapted{}; + Vp9EntropyProbs probs{}; +}; + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp new file mode 100644 index 000000000..c4dd4881a --- /dev/null +++ b/src/video_core/command_classes/host1x.cpp @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/host1x.h" +#include "video_core/gpu.h" + +Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} + +Tegra::Host1x::~Host1x() = default; + +void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Tegra::Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) { + StateWrite(static_cast<u32>(method), arguments[0]); + switch (method) { + case Method::WaitSyncpt: + Execute(arguments[0]); + break; + case Method::LoadSyncptPayload32: + syncpoint_value = arguments[0]; + break; + case Method::WaitSyncpt32: + Execute(arguments[0]); + break; + default: + UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method)); + break; + } +} + +void Tegra::Host1x::Execute(u32 data) { + // This method waits on a valid syncpoint. + // TODO: Implement when proper Async is in place +} diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h new file mode 100644 index 000000000..013eaa0c1 --- /dev/null +++ b/src/video_core/command_classes/host1x.h @@ -0,0 +1,78 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
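StateWrite in host1x.cpp above addresses the register block by 32-bit word index and patches it with std::memcpy. A minimal, self-contained sketch of that pattern, using a made-up two-register layout in place of the real Host1xClassRegisters:

#include <cstdint>
#include <cstring>
#include <iostream>

namespace {
// Hypothetical register block; the real layout is Host1xClassRegisters in host1x.h.
struct Regs {
    std::uint32_t incr_syncpt; // word 0
    std::uint32_t wait_syncpt; // word 1
};

// Offsets are given in 32-bit words, so scale to bytes before copying the argument in.
void StateWrite(Regs& state, std::uint32_t offset, std::uint32_t argument) {
    auto* const dst = reinterpret_cast<std::uint8_t*>(&state) + offset * sizeof(std::uint32_t);
    std::memcpy(dst, &argument, sizeof(argument));
}
} // namespace

int main() {
    Regs regs{};
    StateWrite(regs, 1, 0xCAFE);                       // lands in regs.wait_syncpt
    std::cout << std::hex << regs.wait_syncpt << '\n'; // prints "cafe"
    return 0;
}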
+ +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; +class Nvdec; + +class Host1x { +public: + struct Host1xClassRegisters { + u32 incr_syncpt{}; + u32 incr_syncpt_ctrl{}; + u32 incr_syncpt_error{}; + INSERT_PADDING_WORDS(5); + u32 wait_syncpt{}; + u32 wait_syncpt_base{}; + u32 wait_syncpt_incr{}; + u32 load_syncpt_base{}; + u32 incr_syncpt_base{}; + u32 clear{}; + u32 wait{}; + u32 wait_with_interrupt{}; + u32 delay_use{}; + u32 tick_count_high{}; + u32 tick_count_low{}; + u32 tick_ctrl{}; + INSERT_PADDING_WORDS(23); + u32 ind_ctrl{}; + u32 ind_off2{}; + u32 ind_off{}; + std::array<u32, 31> ind_data{}; + INSERT_PADDING_WORDS(1); + u32 load_syncpoint_payload32{}; + u32 stall_ctrl{}; + u32 wait_syncpt32{}; + u32 wait_syncpt_base32{}; + u32 load_syncpt_base32{}; + u32 incr_syncpt_base32{}; + u32 stall_count_high{}; + u32 stall_count_low{}; + u32 xref_ctrl{}; + u32 channel_xref_high{}; + u32 channel_xref_low{}; + }; + static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size"); + + enum class Method : u32 { + WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4, + LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4, + WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4, + }; + + explicit Host1x(GPU& gpu); + ~Host1x(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + /// For Host1x, execute is waiting on a syncpoint previously written into the state + void Execute(u32 data); + + /// Write argument into the provided offset + void StateWrite(u32 offset, u32 arguments); + + u32 syncpoint_value{}; + Host1xClassRegisters state{}; + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp new file mode 100644 index 000000000..8ca7a7b06 --- /dev/null +++ b/src/video_core/command_classes/nvdec.cpp @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
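The Method enumerators in host1x.h above are derived from the register layout itself: the byte offset of a register, divided by the word size, is the method index seen on the pushbuffer. A self-contained example of the same offsetof technique; the struct and names here are illustrative, not the real register block:

#include <cstddef>
#include <cstdint>
#include <iostream>

struct ExampleRegs {
    std::uint32_t incr_syncpt;      // word 0
    std::uint32_t incr_syncpt_ctrl; // word 1
    std::uint32_t wait_syncpt;      // word 2
};

enum class ExampleMethod : std::uint32_t {
    // Byte offset divided by 4 yields the word-indexed method number.
    WaitSyncpt = offsetof(ExampleRegs, wait_syncpt) / sizeof(std::uint32_t),
};
static_assert(static_cast<std::uint32_t>(ExampleMethod::WaitSyncpt) == 2);

int main() {
    std::cout << static_cast<std::uint32_t>(ExampleMethod::WaitSyncpt) << '\n'; // prints 2
    return 0;
}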
+ +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/gpu.h" + +namespace Tegra { + +Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} + +Nvdec::~Nvdec() = default; + +void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { + if (method == Method::SetVideoCodec) { + codec->StateWrite(static_cast<u32>(method), arguments[0]); + } else { + codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); + } + + switch (method) { + case Method::SetVideoCodec: + codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); + break; + case Method::Execute: + Execute(); + break; + } +} + +AVFrame* Nvdec::GetFrame() { + return codec->GetCurrentFrame(); +} + +const AVFrame* Nvdec::GetFrame() const { + return codec->GetCurrentFrame(); +} + +void Nvdec::Execute() { + switch (codec->GetCurrentCodec()) { + case NvdecCommon::VideoCodec::H264: + case NvdecCommon::VideoCodec::Vp9: + codec->Decode(); + break; + default: + UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec())); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h new file mode 100644 index 000000000..eec4443f9 --- /dev/null +++ b/src/video_core/command_classes/nvdec.h @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/common_types.h" +#include "video_core/command_classes/codecs/codec.h" + +namespace Tegra { +class GPU; + +class Nvdec { +public: + enum class Method : u32 { + SetVideoCodec = 0x80, + Execute = 0xc0, + }; + + explicit Nvdec(GPU& gpu); + ~Nvdec(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, const std::vector<u32>& arguments); + + /// Return most recently decoded frame + [[nodiscard]] AVFrame* GetFrame(); + [[nodiscard]] const AVFrame* GetFrame() const; + +private: + /// Invoke codec to decode a frame + void Execute(); + + GPU& gpu; + std::unique_ptr<Codec> codec; +}; +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h new file mode 100644 index 000000000..01b5e086d --- /dev/null +++ b/src/video_core/command_classes/nvdec_common.h @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
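Nvdec::ProcessMethod above stores most register arguments shifted left by 8, i.e. the pushbuffer passes GPU addresses in 256-byte units. A small self-contained illustration of that convention; the values are made up:

#include <cstdint>
#include <iostream>

// Arguments encode addresses divided by 256; shifting left by 8 restores the
// byte-granular GPU virtual address, as done for the nvdec register offsets above.
constexpr std::uint64_t ToGpuAddress(std::uint32_t argument) {
    return static_cast<std::uint64_t>(argument) << 8;
}
static_assert(ToGpuAddress(0x1000) == 0x100000);

int main() {
    std::cout << std::hex << ToGpuAddress(0x12345) << '\n'; // prints "1234500"
    return 0;
}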
+ +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::NvdecCommon { + +struct NvdecRegisters { + INSERT_PADDING_WORDS(256); + u64 set_codec_id{}; + INSERT_PADDING_WORDS(254); + u64 set_platform_id{}; + u64 picture_info_offset{}; + u64 frame_bitstream_offset{}; + u64 frame_number{}; + u64 h264_slice_data_offsets{}; + u64 h264_mv_dump_offset{}; + INSERT_PADDING_WORDS(6); + u64 frame_stats_offset{}; + u64 h264_last_surface_luma_offset{}; + u64 h264_last_surface_chroma_offset{}; + std::array<u64, 17> surface_luma_offset{}; + std::array<u64, 17> surface_chroma_offset{}; + INSERT_PADDING_WORDS(132); + u64 vp9_entropy_probs_offset{}; + u64 vp9_backward_updates_offset{}; + u64 vp9_last_frame_segmap_offset{}; + u64 vp9_curr_frame_segmap_offset{}; + INSERT_PADDING_WORDS(2); + u64 vp9_last_frame_mvs_offset{}; + u64 vp9_curr_frame_mvs_offset{}; + INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +enum class VideoCodec : u32 { + None = 0x0, + H264 = 0x3, + Vp8 = 0x5, + H265 = 0x7, + Vp9 = 0x9, +}; + +} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp new file mode 100644 index 000000000..19dc9e0ab --- /dev/null +++ b/src/video_core/command_classes/sync_manager.cpp @@ -0,0 +1,60 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#include <algorithm> +#include "sync_manager.h" +#include "video_core/gpu.h" + +namespace Tegra { +SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} +SyncptIncrManager::~SyncptIncrManager() = default; + +void SyncptIncrManager::Increment(u32 id) { + increments.emplace_back(0, 0, id, true); + IncrementAllDone(); +} + +u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { + const u32 handle = current_id++; + increments.emplace_back(handle, class_id, id); + return handle; +} + +void SyncptIncrManager::SignalDone(u32 handle) { + const auto done_incr = + std::find_if(increments.begin(), increments.end(), + [handle](const SyncptIncr& incr) { return incr.id == handle; }); + if (done_incr != increments.cend()) { + done_incr->complete = true; + } + IncrementAllDone(); +} + +void SyncptIncrManager::IncrementAllDone() { + std::size_t done_count = 0; + for (; done_count < increments.size(); ++done_count) { + if (!increments[done_count].complete) { + break; + } + gpu.IncrementSyncPoint(increments[done_count].syncpt_id); + } + increments.erase(increments.begin(), increments.begin() + done_count); +} +} // namespace Tegra diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h new file mode 100644 index 000000000..2c321ec58 --- /dev/null +++ b/src/video_core/command_classes/sync_manager.h @@ -0,0 +1,64 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <mutex> +#include <vector> +#include "common/common_types.h" + +namespace Tegra { +class GPU; +struct SyncptIncr { + u32 id; + u32 class_id; + u32 syncpt_id; + bool complete; + + SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) + : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} +}; + +class SyncptIncrManager { +public: + explicit SyncptIncrManager(GPU& gpu); + ~SyncptIncrManager(); + + /// Add syncpoint id and increment all + void Increment(u32 id); + + /// Returns a handle to increment later + u32 IncrementWhenDone(u32 class_id, u32 id); + + /// Marks the increment for the given handle as complete, then runs IncrementAllDone + void SignalDone(u32 handle); + + /// Increment all sequential pending increments that are already done.
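/// Completed entries are consumed from the front of the queue, so increments reach the GPU strictly in submission order.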
+ void IncrementAllDone(); + +private: + std::vector<SyncptIncr> increments; + std::mutex increment_lock; + u32 current_id{}; + + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp new file mode 100644 index 000000000..5b52da277 --- /dev/null +++ b/src/video_core/command_classes/vic.cpp @@ -0,0 +1,180 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/command_classes/vic.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/texture_cache/surface_params.h" + +extern "C" { +#include <libswscale/swscale.h> +} + +namespace Tegra { + +Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) + : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} +Vic::~Vic() = default; + +void Vic::VicStateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method)); + VicStateWrite(static_cast<u32>(method), arguments[0]); + const u64 arg = static_cast<u64>(arguments[0]) << 8; + switch (method) { + case Method::Execute: + Execute(); + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaUOffset: + output_surface_chroma_u_address = arg; + break; + case Method::SetOutputSurfaceChromaVOffset: + output_surface_chroma_v_address = arg; + break; + default: + break; + } +} + +void Vic::Execute() { + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", + vic_state.output_surface.luma_offset); + return; + } + const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; + const VideoPixelFormat pixel_format = + static_cast<VideoPixelFormat>(config.pixel_format.Value()); + switch (pixel_format) { + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBA8: { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + const auto* frame = nvdec_processor->GetFrame(); + + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + if (scaler_ctx == nullptr || frame->width != scaler_width || + frame->height != scaler_height) { + const AVPixelFormat target_format = + (pixel_format == VideoPixelFormat::RGBA8) ? 
AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; + + sws_freeContext(scaler_ctx); + scaler_ctx = nullptr; + + // FFmpeg returns all frames in YUV420, convert it into expected format + scaler_ctx = + sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + + scaler_width = frame->width; + scaler_height = frame->height; + } + // Get Converted frame + const std::size_t linear_size = frame->width * frame->height * 4; + + using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; + AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; + + const int converted_stride{frame->width * 4}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, + &converted_frame_buf_addr, &converted_stride); + + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, + block_height, 0); + std::vector<u8> swizzled_data(size); + Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4, + swizzled_data.data(), converted_frame_buffer.get(), + false, block_height, 0, 1); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); + gpu.Maxwell3D().OnMemoryWrite(); + } else { + // send pitch linear frame + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + gpu.Maxwell3D().OnMemoryWrite(); + } + break; + } + case VideoPixelFormat::Yuv420: { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const auto* frame = nvdec_processor->GetFrame(); + + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t half_width = surface_width / 2; + const std::size_t half_height = config.surface_height_minus1 / 2; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + + const auto* luma_ptr = frame->data[0]; + const auto* chroma_b_ptr = frame->data[1]; + const auto* chroma_r_ptr = frame->data[2]; + const auto stride = frame->linesize[0]; + const auto half_stride = frame->linesize[1]; + + std::vector<u8> luma_buffer(aligned_width * surface_height); + std::vector<u8> chroma_buffer(aligned_width * half_height); + + // Populate luma buffer + for (std::size_t y = 0; y < surface_height - 1; ++y) { + std::size_t src = y * stride; + std::size_t dst = y * aligned_width; + + std::size_t size = surface_width; + + for (std::size_t offset = 0; offset < size; ++offset) { + luma_buffer[dst + offset] = luma_ptr[src + offset]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Populate chroma buffer from both channels with interleaving. 
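// The decoded frame keeps U and V in separate planes; interleaving them byte by byte
// below produces the semi-planar chroma layout written to the chroma output address.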
+ for (std::size_t y = 0; y < half_height; ++y) { + std::size_t src = y * half_stride; + std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), + chroma_buffer.size()); + gpu.Maxwell3D().OnMemoryWrite(); + break; + } + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value()); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h new file mode 100644 index 000000000..8c4e284a1 --- /dev/null +++ b/src/video_core/command_classes/vic.h @@ -0,0 +1,110 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; +class Nvdec; + +struct PlaneOffsets { + u32 luma_offset{}; + u32 chroma_u_offset{}; + u32 chroma_v_offset{}; +}; + +struct VicRegisters { + INSERT_PADDING_WORDS(64); + u32 nop{}; + INSERT_PADDING_WORDS(15); + u32 pm_trigger{}; + INSERT_PADDING_WORDS(47); + u32 set_application_id{}; + u32 set_watchdog_timer{}; + INSERT_PADDING_WORDS(17); + u32 context_save_area{}; + u32 context_switch{}; + INSERT_PADDING_WORDS(43); + u32 execute{}; + INSERT_PADDING_WORDS(63); + std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; + u32 picture_index{}; + u32 control_params{}; + u32 config_struct_offset{}; + u32 filter_struct_offset{}; + u32 palette_offset{}; + u32 hist_offset{}; + u32 context_id{}; + u32 fce_ucode_size{}; + PlaneOffsets output_surface{}; + u32 fce_ucode_offset{}; + INSERT_PADDING_WORDS(4); + std::array<u32, 8> slot_context_id{}; + INSERT_PADDING_WORDS(16); +}; +static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); + +class Vic { +public: + enum class Method : u32 { + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaUOffset = 0x1c9, + SetOutputSurfaceChromaVOffset = 0x1ca + }; + + explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); + ~Vic(); + + /// Write to the device state. 
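/// Offset arguments (config struct and output surfaces) arrive in 256-byte units and are
/// shifted left by 8 before being stored as GPU addresses.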
+ void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + void Execute(); + + void VicStateWrite(u32 offset, u32 arguments); + VicRegisters vic_state{}; + + enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + Yuv420 = 0x44, + }; + + union VicConfig { + u64_le raw{}; + BitField<0, 7, u64_le> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<19, 3, u64_le> reserved0; + BitField<22, 10, u64_le> reserved1; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; + }; + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_u_address{}; + GPUVAddr output_surface_chroma_v_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp new file mode 100644 index 000000000..b06c32c84 --- /dev/null +++ b/src/video_core/compatible_formats.cpp @@ -0,0 +1,155 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +namespace { + +// Compatibility table taken from Table 3.X.2 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt + +constexpr std::array VIEW_CLASS_128_BITS = { + PixelFormat::R32G32B32A32_FLOAT, + PixelFormat::R32G32B32A32_UINT, + PixelFormat::R32G32B32A32_SINT, +}; + +constexpr std::array VIEW_CLASS_96_BITS = { + PixelFormat::R32G32B32_FLOAT, +}; +// Missing formats: +// PixelFormat::RGB32UI, +// PixelFormat::RGB32I, + +constexpr std::array VIEW_CLASS_64_BITS = { + PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT, + PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT, + PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, + PixelFormat::R16G16B16A16_UINT, PixelFormat::R16G16B16A16_SINT, +}; + +// TODO: How should we handle 48 bits? + +constexpr std::array VIEW_CLASS_32_BITS = { + PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT, + PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT, + PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM, + PixelFormat::R16G16_UNORM, PixelFormat::A8B8G8R8_SNORM, PixelFormat::R16G16_SNORM, + PixelFormat::A8B8G8R8_SRGB, PixelFormat::E5B9G9R9_FLOAT, PixelFormat::B8G8R8A8_UNORM, + PixelFormat::B8G8R8A8_SRGB, PixelFormat::A8B8G8R8_UINT, PixelFormat::A8B8G8R8_SINT, + PixelFormat::A2B10G10R10_UINT, +}; + +// TODO: How should we handle 24 bits? 
+ +constexpr std::array VIEW_CLASS_16_BITS = { + PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT, + PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM, + PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT, +}; + +constexpr std::array VIEW_CLASS_8_BITS = { + PixelFormat::R8_UINT, + PixelFormat::R8_UNORM, + PixelFormat::R8_SINT, + PixelFormat::R8_SNORM, +}; + +constexpr std::array VIEW_CLASS_RGTC1_RED = { + PixelFormat::BC4_UNORM, + PixelFormat::BC4_SNORM, +}; + +constexpr std::array VIEW_CLASS_RGTC2_RG = { + PixelFormat::BC5_UNORM, + PixelFormat::BC5_SNORM, +}; + +constexpr std::array VIEW_CLASS_BPTC_UNORM = { + PixelFormat::BC7_UNORM, + PixelFormat::BC7_SRGB, +}; + +constexpr std::array VIEW_CLASS_BPTC_FLOAT = { + PixelFormat::BC6H_SFLOAT, + PixelFormat::BC6H_UFLOAT, +}; + +// Compatibility table taken from Table 4.X.1 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt + +constexpr std::array COPY_CLASS_128_BITS = { + PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT, + PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM, + PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, + PixelFormat::BC7_UNORM, PixelFormat::BC7_SRGB, PixelFormat::BC6H_SFLOAT, + PixelFormat::BC6H_UFLOAT, +}; +// Missing formats: +// PixelFormat::RGBA32I +// COMPRESSED_RG_RGTC2 + +constexpr std::array COPY_CLASS_64_BITS = { + PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT, + PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, + PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT, + PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_SINT, + PixelFormat::BC1_RGBA_UNORM, PixelFormat::BC1_RGBA_SRGB, +}; +// Missing formats: +// COMPRESSED_RGB_S3TC_DXT1_EXT +// COMPRESSED_SRGB_S3TC_DXT1_EXT +// COMPRESSED_RGBA_S3TC_DXT1_EXT +// COMPRESSED_SIGNED_RED_RGTC1 + +void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) { + compatiblity[format_a][format_b] = true; + compatiblity[format_b][format_a] = true; +} + +void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { + Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +} + +template <typename Range> +void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { + for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { + for (auto it_b = it_a; it_b != range.end(); ++it_b) { + Enable(compatibility, *it_a, *it_b); + } + } +} + +} // Anonymous namespace + +FormatCompatibility::FormatCompatibility() { + for (size_t i = 0; i < MaxPixelFormat; ++i) { + // Identity is allowed + Enable(view, i, i); + } + + EnableRange(view, VIEW_CLASS_128_BITS); + EnableRange(view, VIEW_CLASS_96_BITS); + EnableRange(view, VIEW_CLASS_64_BITS); + EnableRange(view, VIEW_CLASS_32_BITS); + EnableRange(view, VIEW_CLASS_16_BITS); + EnableRange(view, VIEW_CLASS_8_BITS); + EnableRange(view, VIEW_CLASS_RGTC1_RED); + EnableRange(view, VIEW_CLASS_RGTC2_RG); + EnableRange(view, VIEW_CLASS_BPTC_UNORM); + EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + + copy = view; + EnableRange(copy, COPY_CLASS_128_BITS); + EnableRange(copy, COPY_CLASS_64_BITS); +} + +} // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h new file mode 100644 index 000000000..51766349b --- /dev/null +++ b/src/video_core/compatible_formats.h @@ -0,0 
+1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +class FormatCompatibility { +public: + using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; + + explicit FormatCompatibility(); + + bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { + return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + + bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { + return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + +private: + Table view; + Table copy; +}; + +} // namespace VideoCore::Surface diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 713c14182..d8801b1f5 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -12,7 +13,7 @@ namespace Tegra { -DmaPusher::DmaPusher(GPU& gpu) : gpu(gpu) {} +DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {} DmaPusher::~DmaPusher() = default; @@ -21,17 +22,22 @@ MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); + gpu.SyncGuestHost(); // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; - while (Core::System::GetInstance().IsPoweredOn()) { + dma_state.is_last_call = true; + + while (system.IsPoweredOn()) { if (!Step()) { break; } } gpu.FlushCommands(); + gpu.SyncGuestHost(); + gpu.OnCommandListEnd(); } bool DmaPusher::Step() { @@ -40,44 +46,59 @@ bool DmaPusher::Step() { return false; } - const CommandList& command_list{dma_pushbuffer.front()}; - ASSERT_OR_EXECUTE(!command_list.empty(), { - // Somehow the command_list is empty, in order to avoid a crash - // We ignore it and assume its size is 0. - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - return true; - }); - const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; - GPUVAddr dma_get = command_list_header.addr; - GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32); - bool non_main = command_list_header.is_non_main; - - if (dma_pushbuffer_subindex >= command_list.size()) { - // We've gone through the current list, remove it from the queue - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - } + CommandList& command_list{dma_pushbuffer.front()}; - if (command_list_header.size == 0) { - return true; - } + ASSERT_OR_EXECUTE( + command_list.command_lists.size() || command_list.prefetch_command_list.size(), { + // Somehow the command_list is empty, in order to avoid a crash + // We ignore it and assume its size is 0. 
+ dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + return true; + }); - // Push buffer non-empty, read a word - command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + if (command_list.prefetch_command_list.size()) { + // Prefetched command list from nvdrv, used for things like synchronization + command_headers = std::move(command_list.prefetch_command_list); + dma_pushbuffer.pop(); + } else { + const CommandListHeader command_list_header{ + command_list.command_lists[dma_pushbuffer_subindex++]}; + const GPUVAddr dma_get = command_list_header.addr; + + if (dma_pushbuffer_subindex >= command_list.command_lists.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } - for (const CommandHeader& command_header : command_headers) { + if (command_list_header.size == 0) { + return true; + } - // now, see if we're in the middle of a command - if (dma_state.length_pending) { - // Second word of long non-inc methods command - method count - dma_state.length_pending = 0; - dma_state.method_count = command_header.method_count_; - } else if (dma_state.method_count) { + // Push buffer non-empty, read a word + command_headers.resize(command_list_header.size); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); + } + for (std::size_t index = 0; index < command_headers.size();) { + const CommandHeader& command_header = command_headers[index]; + + if (dma_state.method_count) { // Data word of methods command - CallMethod(command_header.argument); + if (dma_state.non_incrementing) { + const u32 max_write = static_cast<u32>( + std::min<std::size_t>(index + dma_state.method_count, command_headers.size()) - + index); + CallMultiMethod(&command_header.argument, max_write); + dma_state.method_count -= max_write; + dma_state.is_last_call = true; + index += max_write; + continue; + } else { + dma_state.is_last_call = dma_state.method_count <= 1; + CallMethod(command_header.argument); + } if (!dma_state.non_incrementing) { dma_state.method++; @@ -117,11 +138,7 @@ bool DmaPusher::Step() { break; } } - } - - if (!non_main) { - // TODO (degasus): This is dead code, as dma_mget is never read. 
- dma_mget = dma_put; + index++; } return true; @@ -134,7 +151,22 @@ void DmaPusher::SetState(const CommandHeader& command_header) { } void DmaPusher::CallMethod(u32 argument) const { - gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); + if (dma_state.method < non_puller_methods) { + gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); + } else { + subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument, + dma_state.is_last_call); + } +} + +void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const { + if (dma_state.method < non_puller_methods) { + gpu.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods, + dma_state.method_count); + } else { + subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start, + num_methods, dma_state.method_count); + } } } // namespace Tegra diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 6ab06518f..96ac267f7 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -4,14 +4,22 @@ #pragma once +#include <array> #include <vector> #include <queue> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" + +namespace Core { +class System; +} namespace Tegra { +class GPU; + enum class SubmissionMode : u32 { IncreasingOld = 0, Increasing = 1, @@ -21,6 +29,31 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence +// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. +// So the values you see in docs might be multiplied by 4. +enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -43,9 +76,23 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); -class GPU; - -using CommandList = std::vector<Tegra::CommandListHeader>; +inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} + +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<Tegra::CommandHeader>&& prefetch_command_list) + : prefetch_command_list{std::move(prefetch_command_list)} {} + + std::vector<Tegra::CommandListHeader> command_lists; + std::vector<Tegra::CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -54,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. 
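 * Methods below the non-puller threshold are handled by the puller itself; higher methods
 * are forwarded to the engine bound to the addressed subchannel.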
*/ -class DmaPusher { +class DmaPusher final { public: - explicit DmaPusher(GPU& gpu); + explicit DmaPusher(Core::System& system, GPU& gpu); ~DmaPusher(); void Push(CommandList&& entries) { @@ -65,14 +112,19 @@ public: void DispatchCalls(); + void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) { + subchannels[subchannel_id] = engine; + } + private: + static constexpr u32 non_puller_methods = 0x40; + static constexpr u32 max_subchannels = 8; bool Step(); void SetState(const CommandHeader& command_header); void CallMethod(u32 argument) const; - - GPU& gpu; + void CallMultiMethod(const u32* base_start, u32 num_methods) const; std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once @@ -85,13 +137,18 @@ private: u32 method_count; ///< Current method count u32 length_pending; ///< Large NI command length pending bool non_incrementing; ///< Current command's NI flag + bool is_last_call; }; DmaState dma_state{}; bool dma_increment_once{}; - GPUVAddr dma_mget{}; ///< main pushbuffer last read address bool ib_enable{true}; ///< IB mode enabled + + std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{}; + + GPU& gpu; + Core::System& system; }; } // namespace Tegra diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h new file mode 100644 index 000000000..18a9db7e6 --- /dev/null +++ b/src/video_core/engines/engine_interface.h @@ -0,0 +1,22 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <type_traits> +#include "common/common_types.h" + +namespace Tegra::Engines { + +class EngineInterface { +public: + /// Write the value to the register identified by method. + virtual void CallMethod(u32 method, u32 method_argument, bool is_last_call) = 0; + + /// Write multiple values to the register identified by method. 
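/// Used by the DMA pusher to hand off runs of non-incrementing method data in one call.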
+ virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) = 0; +}; + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 85d308e26..9409c4075 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -10,15 +10,21 @@ namespace Tegra::Engines { -Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} +Fermi2D::Fermi2D() = default; -void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +Fermi2D::~Fermi2D() = default; + +void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + +void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { // Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit, // so trigger on the second 32-bit write. case FERMI2D_REG_INDEX(blit_src_y) + 1: { @@ -28,7 +34,13 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { } } -std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { +void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + +static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { const u32 line_a = src_2 - src_1; const u32 line_b = dst_2 - dst_1; const u32 excess = std::max<s32>(0, line_a - src_line + src_1); @@ -75,13 +87,13 @@ void Fermi2D::HandleSurfaceCopy() { const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2}; const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2, dst_blit_y2}; - Config copy_config; - copy_config.operation = regs.operation; - copy_config.filter = regs.blit_control.filter; - copy_config.src_rect = src_rect; - copy_config.dst_rect = dst_rect; - - if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { + const Config copy_config{ + .operation = regs.operation, + .filter = regs.blit_control.filter, + .src_rect = src_rect, + .dst_rect = dst_rect, + }; + if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index dba342c70..0909709ec 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -10,6 +10,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/math_util.h" +#include "video_core/engines/engine_interface.h" #include "video_core/gpu.h" namespace Tegra { @@ -31,13 +32,20 @@ namespace Tegra::Engines { #define FERMI2D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32)) -class Fermi2D final { +class Fermi2D final : public EngineInterface { public: - explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer); - ~Fermi2D() = default; + explicit Fermi2D(); + ~Fermi2D(); + + /// Binds a rasterizer to this engine. 
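/// The rasterizer is no longer passed at construction, so this must be called before any
/// surface copy is processed.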
+ void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; enum class Origin : u32 { Center = 0, @@ -137,14 +145,14 @@ public: } regs{}; struct Config { - Operation operation; - Filter filter; + Operation operation{}; + Filter filter{}; Common::Rectangle<u32> src_rect; Common::Rectangle<u32> dst_rect; }; private: - VideoCore::RasterizerInterface& rasterizer; + VideoCore::RasterizerInterface* rasterizer; /// Performs the copy from the source surface to the destination surface as configured in the /// registers. diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 368c75a66..898370739 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -16,28 +16,28 @@ namespace Tegra::Engines { -KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{ - memory_manager, - regs.upload} {} +KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {} KeplerCompute::~KeplerCompute() = default; -void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + +void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid KeplerCompute register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { case KEPLER_COMPUTE_REG_INDEX(exec_upload): { upload_state.ProcessExec(regs.exec_upload.linear != 0); break; } case KEPLER_COMPUTE_REG_INDEX(data_upload): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(method_call.argument, is_last_call); + upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { system.GPU().Maxwell3D().OnMemoryWrite(); } @@ -51,6 +51,13 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { } } +void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const { const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value(); ASSERT(cbuf_mask[regs.tex_cb_index]); @@ -86,8 +93,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const 
Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); @@ -95,11 +105,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con } VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } void KeplerCompute::ProcessLaunch() { @@ -110,7 +120,7 @@ void KeplerCompute::ProcessLaunch() { const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start; LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr); - rasterizer.DispatchCompute(code_addr); + rasterizer->DispatchCompute(code_addr); } Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index eeb79c56f..7f2500aab 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -11,6 +11,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" +#include "video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" @@ -39,12 +40,14 @@ namespace Tegra::Engines { #define KEPLER_COMPUTE_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) -class KeplerCompute final : public ConstBufferEngineInterface { +class KeplerCompute final : public ConstBufferEngineInterface, public EngineInterface { public: - explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); + explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager); ~KeplerCompute(); + /// Binds a rasterizer to this engine. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + static constexpr std::size_t NumConstBuffers = 8; struct Regs { @@ -200,7 +203,11 @@ public: "KeplerCompute LaunchParams has wrong size"); /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. 
+ void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; Texture::FullTextureInfo GetTexture(std::size_t offset) const; @@ -214,6 +221,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } @@ -223,11 +232,6 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; private: - Core::System& system; - VideoCore::RasterizerInterface& rasterizer; - MemoryManager& memory_manager; - Upload::State upload_state; - void ProcessLaunch(); /// Retrieves information about a specific TIC entry from the TIC buffer. @@ -235,6 +239,11 @@ private: /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; + + Core::System& system; + MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer = nullptr; + Upload::State upload_state; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 597872e43..dc71b2eec 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -19,20 +19,19 @@ KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) KeplerMemory::~KeplerMemory() = default; -void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid KeplerMemory register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { case KEPLERMEMORY_REG_INDEX(exec): { upload_state.ProcessExec(regs.exec.linear != 0); break; } case KEPLERMEMORY_REG_INDEX(data): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(method_call.argument, is_last_call); + upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { system.GPU().Maxwell3D().OnMemoryWrite(); } @@ -41,4 +40,11 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { } } +void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 396fb6e86..5b7f71a00 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -10,6 +10,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" @@ -32,13 +33,17 @@ namespace Tegra::Engines { #define KEPLERMEMORY_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) -class KeplerMemory final { +class KeplerMemory final : public EngineInterface { public: KeplerMemory(Core::System& system, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by 
method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; struct Regs { static constexpr size_t NUM_REGS = 0x7F; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index ba63b44b4..6287df633 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -22,15 +22,19 @@ using VideoCore::QueryType; /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; -Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { +Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, + upload_state{memory_manager, regs.upload} { dirty.flags.flip(); - InitializeRegisterDefaults(); } +Maxwell3D::~Maxwell3D() = default; + +void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + void Maxwell3D::InitializeRegisterDefaults() { // Initializes registers to their default values - what games expect them to be at boot. This is // for certain registers that may not be explicitly set by games. @@ -44,6 +48,12 @@ void Maxwell3D::InitializeRegisterDefaults() { viewport.depth_range_near = 0.0f; viewport.depth_range_far = 1.0f; } + for (auto& viewport : regs.viewport_transform) { + viewport.swizzle.x.Assign(Regs::ViewportSwizzle::PositiveX); + viewport.swizzle.y.Assign(Regs::ViewportSwizzle::PositiveY); + viewport.swizzle.z.Assign(Regs::ViewportSwizzle::PositiveZ); + viewport.swizzle.w.Assign(Regs::ViewportSwizzle::PositiveW); + } // Doom and Bomberman seems to use the uninitialized registers and just enable blend // so initialize blend registers with sane values @@ -92,11 +102,19 @@ void Maxwell3D::InitializeRegisterDefaults() { color_mask.A.Assign(1); } + for (auto& format : regs.vertex_attrib_format) { + format.constant.Assign(1); + } + // NVN games expect these values to be enabled at boot regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; + regs.line_width_aliased = 1.0f; + regs.line_width_smooth = 1.0f; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; + regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; + regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; shadow_state = regs; @@ -106,7 +124,113 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) { + if (executing_macro == 0) { + // A macro call must begin by writing the macro method's register, not its argument. 
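
The CallMultiMethod overloads above fall back to one CallMethod per argument, and `methods_pending - i <= 1` marks only the final write of the batch as the last call. A self-contained sketch of that decomposition (the callback, method number and argument values are made up for illustration):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Fallback used by engines without a dedicated multi-write path: each argument
// becomes an individual register write, and only the final write of the whole
// batch carries is_last_call = true (methods_pending counts down across calls).
void CallMultiMethod(std::uint32_t method, const std::uint32_t* base_start, std::uint32_t amount,
                     std::uint32_t methods_pending,
                     void (*call_method)(std::uint32_t, std::uint32_t, bool)) {
    for (std::size_t i = 0; i < amount; ++i) {
        const bool is_last_call = methods_pending - static_cast<std::uint32_t>(i) <= 1;
        call_method(method, base_start[i], is_last_call);
    }
}

int main() {
    const std::uint32_t args[3] = {10, 20, 30};
    // Three writes pending: only the third call reports is_last_call == true.
    CallMultiMethod(0x6c, args, 3, 3, [](std::uint32_t method, std::uint32_t argument, bool last) {
        std::printf("method=0x%x argument=%u last=%d\n", static_cast<unsigned>(method),
                    static_cast<unsigned>(argument), last ? 1 : 0);
    });
}
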
+ ASSERT_MSG((method % 2) == 0, + "Can't start macro execution by writing to the ARGS register"); + executing_macro = method; + } + + macro_params.insert(macro_params.end(), base_start, base_start + amount); + + // Call the macro when there are no more parameters in the command buffer + if (is_last_call) { + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); + } +} + +u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { + // Keep track of the register value in shadow_state when requested. + const auto control = shadow_state.shadow_ram_control; + if (control == Regs::ShadowRamControl::Track || + control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = argument; + return argument; + } + if (control == Regs::ShadowRamControl::Replay) { + return shadow_state.reg_array[method]; + } + return argument; +} + +void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { + if (regs.reg_array[method] == argument) { + return; + } + regs.reg_array[method] = argument; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; + } +} + +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, + bool is_last_call) { + switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): + return rasterizer->WaitForIdle(); + case MAXWELL3D_REG_INDEX(shadow_ram_control): + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument); + return; + case MAXWELL3D_REG_INDEX(macros.data): + return macro_engine->AddCode(regs.macros.upload_address, argument); + case MAXWELL3D_REG_INDEX(macros.bind): + return ProcessMacroBind(argument); + case MAXWELL3D_REG_INDEX(firmware[4]): + return ProcessFirmwareCall4(); + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + return StartCBData(method); + case MAXWELL3D_REG_INDEX(cb_bind[0]): + return ProcessCBBind(0); + case MAXWELL3D_REG_INDEX(cb_bind[1]): + return ProcessCBBind(1); + case MAXWELL3D_REG_INDEX(cb_bind[2]): + return ProcessCBBind(2); + case MAXWELL3D_REG_INDEX(cb_bind[3]): + return ProcessCBBind(3); + case MAXWELL3D_REG_INDEX(cb_bind[4]): + return ProcessCBBind(4); + case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): + return DrawArrays(); + case MAXWELL3D_REG_INDEX(clear_buffers): + return ProcessClearBuffers(); + case MAXWELL3D_REG_INDEX(query.query_get): + return ProcessQueryGet(); + case MAXWELL3D_REG_INDEX(condition.mode): + return ProcessQueryCondition(); + case MAXWELL3D_REG_INDEX(counter_reset): + return ProcessCounterReset(); + case MAXWELL3D_REG_INDEX(sync_info): + return ProcessSyncPoint(); + case MAXWELL3D_REG_INDEX(exec_upload): + return upload_state.ProcessExec(regs.exec_upload.linear != 0); + case MAXWELL3D_REG_INDEX(data_upload): + 
upload_state.ProcessData(argument, is_last_call); + if (is_last_call) { + OnMemoryWrite(); + } + return; + } +} + +void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -115,18 +239,16 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3 ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size()); // Execute the current macro. - macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); + macro_engine->Execute(*this, macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } } -void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { - const u32 method = method_call.method; - +void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { if (method == cb_data_state.current) { - regs.reg_array[method] = method_call.argument; - ProcessCBData(method_call.argument); + regs.reg_array[method] = method_argument; + ProcessCBData(method_argument); return; } else if (cb_data_state.current != null_cb_data) { FinishCBData(); @@ -141,61 +263,27 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - macro_params.push_back(method_call.argument); - - // Call the macro when there are no more parameters in the command buffer - if (method_call.IsLastCall()) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); - } + ProcessMacro(method, &method_argument, 1, is_last_call); return; } ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - u32 arg = method_call.argument; - // Keep track of the register value in shadow_state when requested. - if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || - shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { - shadow_state.reg_array[method] = arg; - } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { - arg = shadow_state.reg_array[method]; - } - - if (regs.reg_array[method] != arg) { - regs.reg_array[method] = arg; + const u32 argument = ProcessShadowRam(method, method_argument); + ProcessDirtyRegisters(method, argument); + ProcessMethodCall(method, argument, method_argument, is_last_call); +} - for (const auto& table : dirty.tables) { - dirty.flags[table[method]] = true; - } +void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + // Methods after 0xE00 are special, they're actually triggers for some microcode that was + // uploaded to the GPU during initialization. 
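
ProcessShadowRam above models the hardware shadow RAM: Track and TrackWithFilter record every write into the shadow state, Replay discards the incoming argument and substitutes the recorded value, and any other mode passes the argument through. A compact sketch of those semantics over a plain array; the enumerator values are illustrative assumptions, only the mode names come from the code above:

#include <array>
#include <cassert>
#include <cstdint>

// Enumerator values here are illustrative, not taken from the register headers.
enum class ShadowRamControl : std::uint32_t { Track, TrackWithFilter, Passthrough, Replay };

struct ShadowState {
    ShadowRamControl control = ShadowRamControl::Track;
    std::array<std::uint32_t, 0x100> reg_array{}; // Shadow copy of a (truncated) register file.
};

// Returns the argument that should actually be written to the live registers.
std::uint32_t ProcessShadowRam(ShadowState& shadow, std::uint32_t method, std::uint32_t argument) {
    if (shadow.control == ShadowRamControl::Track ||
        shadow.control == ShadowRamControl::TrackWithFilter) {
        shadow.reg_array[method] = argument; // Record the write.
        return argument;
    }
    if (shadow.control == ShadowRamControl::Replay) {
        return shadow.reg_array[method];     // Ignore the argument, replay the recorded value.
    }
    return argument;                         // Passthrough.
}

int main() {
    ShadowState shadow;
    ProcessShadowRam(shadow, 0x10, 42);              // Tracked.
    shadow.control = ShadowRamControl::Replay;
    assert(ProcessShadowRam(shadow, 0x10, 7) == 42); // Replayed value wins over the argument.
}
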
+ if (method >= MacroRegistersStart) { + ProcessMacro(method, base_start, amount, amount == methods_pending); + return; } - switch (method) { - case MAXWELL3D_REG_INDEX(shadow_ram_control): { - shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_call.argument); - break; - } - case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(arg); - break; - } - case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(arg); - break; - } - case MAXWELL3D_REG_INDEX(firmware[4]): { - ProcessFirmwareCall4(); - break; - } case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): @@ -211,67 +299,13 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - StartCBData(method); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[0]): { - ProcessCBBind(0); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[1]): { - ProcessCBBind(1); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[2]): { - ProcessCBBind(2); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[3]): { - ProcessCBBind(3); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[4]): { - ProcessCBBind(4); - break; - } - case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): { - DrawArrays(); - break; - } - case MAXWELL3D_REG_INDEX(clear_buffers): { - ProcessClearBuffers(); - break; - } - case MAXWELL3D_REG_INDEX(query.query_get): { - ProcessQueryGet(); - break; - } - case MAXWELL3D_REG_INDEX(condition.mode): { - ProcessQueryCondition(); - break; - } - case MAXWELL3D_REG_INDEX(counter_reset): { - ProcessCounterReset(); - break; - } - case MAXWELL3D_REG_INDEX(sync_info): { - ProcessSyncPoint(); - break; - } - case MAXWELL3D_REG_INDEX(exec_upload): { - upload_state.ProcessExec(regs.exec_upload.linear != 0); - break; - } - case MAXWELL3D_REG_INDEX(data_upload): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(arg, is_last_call); - if (is_last_call) { - OnMemoryWrite(); - } + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + ProcessCBMultiData(method, base_start, amount); break; - } default: + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } break; } } @@ -300,16 +334,15 @@ void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) { StepInstance(expected_mode, count); } -void Maxwell3D::CallMethodFromMME(const GPU::MethodCall& method_call) { - const u32 method = method_call.method; +void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) { if (mme_inline[method]) { - regs.reg_array[method] = method_call.argument; + regs.reg_array[method] = method_argument; if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count) || method == MAXWELL3D_REG_INDEX(index_array.count)) { const MMEDrawMode expected_mode = method == MAXWELL3D_REG_INDEX(vertex_buffer.count) ? 
MMEDrawMode::Array : MMEDrawMode::Indexed; - StepInstance(expected_mode, method_call.argument); + StepInstance(expected_mode, method_argument); } else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) { mme_draw.instance_mode = (regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0); @@ -321,7 +354,7 @@ void Maxwell3D::CallMethodFromMME(const GPU::MethodCall& method_call) { if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } - CallMethod(method_call); + CallMethod(method, method_argument, true); } } @@ -337,7 +370,7 @@ void Maxwell3D::FlushMMEInlineDraw() { const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { - rasterizer.Draw(is_indexed, true); + rasterizer->Draw(is_indexed, true); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -358,9 +391,7 @@ void Maxwell3D::FlushMMEInlineDraw() { } void Maxwell3D::ProcessMacroUpload(u32 data) { - ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), - "upload_address exceeded macro_memory size!"); - macro_memory[regs.macros.upload_address++] = data; + macro_engine->AddCode(regs.macros.upload_address++, data); } void Maxwell3D::ProcessMacroBind(u32 data) { @@ -395,12 +426,17 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { void Maxwell3D::ProcessQueryGet() { // TODO(Subv): Support the other query units. - ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, - "Units other than CROP are unimplemented"); + if (regs.query.query_get.unit != Regs::QueryUnit::Crop) { + LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented"); + } switch (regs.query.query_get.operation) { case Regs::QueryOperation::Release: - StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); + if (regs.query.query_get.fence == 1) { + rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence); + } else { + StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); + } break; case Regs::QueryOperation::Acquire: // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that @@ -465,11 +501,11 @@ void Maxwell3D::ProcessQueryCondition() { void Maxwell3D::ProcessCounterReset() { switch (regs.counter_reset) { case Regs::CounterReset::SampleCnt: - rasterizer.ResetCounter(QueryType::SamplesPassed); + rasterizer->ResetCounter(QueryType::SamplesPassed); break; default: - LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", - static_cast<int>(regs.counter_reset)); + LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", + static_cast<int>(regs.counter_reset)); break; } } @@ -479,7 +515,7 @@ void Maxwell3D::ProcessSyncPoint() { const u32 increment = regs.sync_info.increment.Value(); [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value(); if (increment) { - system.GPU().IncrementSyncPoint(sync_point); + rasterizer->SignalSyncPoint(sync_point); } } @@ -502,7 +538,7 @@ void Maxwell3D::DrawArrays() { const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count}; if (ShouldExecute()) { - rasterizer.Draw(is_indexed, false); + rasterizer->Draw(is_indexed, false); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -522,12 +558,12 @@ std::optional<u64> Maxwell3D::GetQueryResult() { return 0; case Regs::QuerySelect::SamplesPassed: // Deferred. 
- rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, - system.GPU().GetTicks()); - return {}; + rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); + return std::nullopt; default: - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); + LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); return 1; } } @@ -562,6 +598,28 @@ void Maxwell3D::StartCBData(u32 method) { ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); } +void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) { + if (cb_data_state.current != method) { + if (cb_data_state.current != null_cb_data) { + FinishCBData(); + } + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + cb_data_state.start_pos = regs.const_buffer.cb_pos; + cb_data_state.id = method - first_cb_data; + cb_data_state.current = method; + cb_data_state.counter = 0; + } + const std::size_t id = cb_data_state.id; + const std::size_t size = amount; + std::size_t i = 0; + for (; i < size; i++) { + cb_data_state.buffer[id][cb_data_state.counter] = start_base[i]; + cb_data_state.counter++; + } + // Increment the current buffer position. + regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount; +} + void Maxwell3D::FinishCBData() { // Write the input value to the current const buffer at the current position. const GPUVAddr buffer_address = regs.const_buffer.BufferAddress(); @@ -628,7 +686,7 @@ void Maxwell3D::ProcessClearBuffers() { regs.clear_buffers.R == regs.clear_buffers.B && regs.clear_buffers.R == regs.clear_buffers.A); - rasterizer.Clear(); + rasterizer->Clear(); } u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const { @@ -650,8 +708,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); @@ -659,11 +720,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b } VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 5cf6a4cc3..1cbe8fe67 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -19,10 +19,11 @@ #include "common/math_util.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/const_buffer_info.h" +#include 
"video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -48,11 +49,13 @@ namespace Tegra::Engines { #define MAXWELL3D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32)) -class Maxwell3D final : public ConstBufferEngineInterface { +class Maxwell3D final : public ConstBufferEngineInterface, public EngineInterface { public: - explicit Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); - ~Maxwell3D() = default; + explicit Maxwell3D(Core::System& system, MemoryManager& memory_manager); + ~Maxwell3D(); + + /// Binds a rasterizer to this engine. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); /// Register structure of the Maxwell3D engine. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered. @@ -575,6 +578,17 @@ public: Replay = 3, }; + enum class ViewportSwizzle : u32 { + PositiveX = 0, + NegativeX = 1, + PositiveY = 2, + NegativeY = 3, + PositiveZ = 4, + NegativeZ = 5, + PositiveW = 6, + NegativeW = 7, + }; + struct RenderTargetConfig { u32 address_high; u32 address_low; @@ -586,6 +600,7 @@ public: BitField<4, 3, u32> block_height; BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; + BitField<16, 1, u32> is_3d; } memory_layout; union { BitField<0, 16, u32> layers; @@ -618,7 +633,14 @@ public: f32 translate_x; f32 translate_y; f32 translate_z; - INSERT_UNION_PADDING_WORDS(2); + union { + u32 raw; + BitField<0, 3, ViewportSwizzle> x; + BitField<4, 3, ViewportSwizzle> y; + BitField<8, 3, ViewportSwizzle> z; + BitField<12, 3, ViewportSwizzle> w; + } swizzle; + INSERT_UNION_PADDING_WORDS(1); Common::Rectangle<f32> GetRect() const { return { @@ -627,7 +649,7 @@ public: GetX() + GetWidth(), // right GetY() // bottom }; - }; + } f32 GetX() const { return std::max(0.0f, translate_x - std::fabs(scale_x)); @@ -709,7 +731,9 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x45); + INSERT_UNION_PADDING_WORDS(0x44); + + u32 wait_for_idle; struct { u32 upload_address; @@ -1149,7 +1173,7 @@ public: /// Returns whether the vertex array specified by index is supposed to be /// accessed per instance or not. - bool IsInstancingEnabled(u32 index) const { + bool IsInstancingEnabled(std::size_t index) const { return is_instanced[index]; } } instanced_arrays; @@ -1179,6 +1203,7 @@ public: BitField<0, 1, u32> depth_range_0_1; BitField<3, 1, u32> depth_clamp_near; BitField<4, 1, u32> depth_clamp_far; + BitField<11, 1, u32> depth_clamp_disabled; } view_volume_clip_control; INSERT_UNION_PADDING_WORDS(0x1F); @@ -1259,7 +1284,8 @@ public: GPUVAddr LimitAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) | - limit_low); + limit_low) + + 1; } } vertex_array_limit[NumVertexArrays]; @@ -1356,10 +1382,14 @@ public: u32 GetRegisterValue(u32 method) const; /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; /// Write the value to the register identified by method. 
- void CallMethodFromMME(const GPU::MethodCall& method_call); + void CallMethodFromMME(u32 method, u32 method_argument); void FlushMMEInlineDraw(); @@ -1376,6 +1406,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } @@ -1384,17 +1416,16 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; - /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than - /// we've seen used. - using MacroMemory = std::array<u32, 0x40000>; + bool ShouldExecute() const { + return execute_on; + } - /// Gets a reference to macro memory. - const MacroMemory& GetMacroMemory() const { - return macro_memory; + VideoCore::RasterizerInterface& Rasterizer() { + return *rasterizer; } - bool ShouldExecute() const { - return execute_on; + const VideoCore::RasterizerInterface& Rasterizer() const { + return *rasterizer; } /// Notify a memory write has happened. @@ -1430,27 +1461,31 @@ public: private: void InitializeRegisterDefaults(); - Core::System& system; + void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call); - VideoCore::RasterizerInterface& rasterizer; + u32 ProcessShadowRam(u32 method, u32 argument); + void ProcessDirtyRegisters(u32 method, u32 argument); + + void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call); + + Core::System& system; MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer = nullptr; + /// Start offsets of each macro in macro_memory std::array<u32, 0x80> macro_positions = {}; std::array<bool, Regs::NUM_REGS> mme_inline{}; - /// Memory for macro code - MacroMemory macro_memory; - /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; /// Parameters that have been submitted to the macro call so far. std::vector<u32> macro_params; /// Interpreter for the macro codes uploaded to the GPU. - MacroInterpreter macro_interpreter; + std::unique_ptr<MacroEngine> macro_engine; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1479,7 +1514,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); + void CallMacroMethod(u32 method, const std::vector<u32>& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); @@ -1511,6 +1546,7 @@ private: /// Handles a write to the CB_DATA[i] register. void StartCBData(u32 method); void ProcessCBData(u32 value); + void ProcessCBMultiData(u32 method, const u32* start_base, u32 amount); void FinishCBData(); /// Handles a write to the CB_BIND register. 
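
Macro dispatch in CallMacroMethod maps a trigger method back to a slot in macro_positions with ((method - 0xE00) >> 1) modulo the table size: even method numbers select a macro, and the odd ones in between feed its arguments. A worked example of that mapping (MacroEntry is a stand-in name for the expression used inline in the engine):

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t MacroRegistersStart = 0xE00;

// Maps a macro trigger method to an index into the bound-macro table
// (start offsets of each uploaded macro), as computed by CallMacroMethod.
constexpr std::size_t MacroEntry(std::uint32_t method, std::size_t num_entries) {
    return ((method - MacroRegistersStart) >> 1) % num_entries;
}

int main() {
    constexpr std::size_t num_entries = 0x80; // Size of macro_positions in the engine.
    // Method 0xE00 selects macro slot 0, 0xE02 selects slot 1, and so on;
    // the odd methods in between (0xE01, 0xE03, ...) carry the macro's parameters.
    std::printf("0xE00 -> %zu\n", MacroEntry(0xE00, num_entries));
    std::printf("0xE02 -> %zu\n", MacroEntry(0xE02, num_entries));
    std::printf("0xE24 -> %zu\n", MacroEntry(0xE24, num_entries));
}
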
@@ -1530,6 +1566,7 @@ private: static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") +ASSERT_REG_POSITION(wait_for_idle, 0x44); ASSERT_REG_POSITION(macros, 0x45); ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index c2610f992..8fa359d0a 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,43 +14,45 @@ namespace Tegra::Engines { +using namespace Texture; + MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager) : system{system}, memory_manager{memory_manager} {} -void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, - "Invalid MaxwellDMA register, increase the size of the Regs structure"); - - regs.reg_array[method_call.method] = method_call.argument; +void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register"); -#define MAXWELLDMA_REG_INDEX(field_name) \ - (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32)) + regs.reg_array[method] = method_argument; - switch (method_call.method) { - case MAXWELLDMA_REG_INDEX(exec): { - HandleCopy(); - break; - } + if (method == offsetof(Regs, launch_dma) / sizeof(u32)) { + Launch(); } - -#undef MAXWELLDMA_REG_INDEX } -void MaxwellDMA::HandleCopy() { - LOG_TRACE(HW_GPU, "Requested a DMA copy"); +void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (size_t i = 0; i < amount; ++i) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} - const GPUVAddr source = regs.src_address.Address(); - const GPUVAddr dest = regs.dst_address.Address(); +void MaxwellDMA::Launch() { + LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in), + static_cast<GPUVAddr>(regs.offset_out)); // TODO(Subv): Perform more research and implement all features of this engine. - ASSERT(regs.exec.enable_swizzle == 0); - ASSERT(regs.exec.query_mode == Regs::QueryMode::None); - ASSERT(regs.exec.query_intr == Regs::QueryIntr::None); - ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2); - ASSERT(regs.dst_params.pos_x == 0); - ASSERT(regs.dst_params.pos_y == 0); - - if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + const LaunchDMA& launch = regs.launch_dma; + ASSERT(launch.remap_enable == 0); + ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE); + ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); + ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); + ASSERT(regs.dst_params.origin.x == 0); + ASSERT(regs.dst_params.origin.y == 0); + + const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; + const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; + + if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented"); return; @@ -59,99 +61,154 @@ void MaxwellDMA::HandleCopy() { // All copies here update the main memory, so mark all rasterizer states as invalid. 
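
The rewritten MaxwellDMA kicks off a copy when the launch_dma register is written; the trigger index is derived from the field offset as offsetof(Regs, launch_dma) / sizeof(u32), word 0xC0 per the ASSERT_REG_POSITION further below. A reduced sketch of that trigger, with the register block trimmed down to just the relevant field:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Simplified register block: only enough padding to place launch_dma at word 0xC0.
struct Regs {
    std::uint32_t reserved[0xC0];
    std::uint32_t launch_dma;
};

// Register methods are word indices into the register block, so the trigger
// index follows from the field offset, as in MaxwellDMA::CallMethod.
constexpr std::uint32_t LAUNCH_DMA_INDEX = offsetof(Regs, launch_dma) / sizeof(std::uint32_t);

void CallMethod(std::uint32_t method, std::uint32_t argument) {
    // The real engine first stores the argument into regs.reg_array[method].
    if (method == LAUNCH_DMA_INDEX) {
        std::printf("launch_dma written with 0x%x -> start the copy\n",
                    static_cast<unsigned>(argument));
    }
}

int main() {
    CallMethod(LAUNCH_DMA_INDEX, 0x1);
}
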
system.GPU().Maxwell3D().OnMemoryWrite(); - if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { - // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D - // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, - // y_count). - if (!regs.exec.enable_2d) { - memory_manager.CopyBlock(dest, source, regs.x_count); - return; - } + if (is_src_pitch && is_dst_pitch) { + CopyPitchToPitch(); + } else { + ASSERT(launch.multi_line_enable == 1); - // If both the source and the destination are in linear layout, perform a line-by-line - // copy. We're going to take a subrect of size (x_count, y_count) from the source - // rectangle. There is no need to manually flush/invalidate the regions because - // CopyBlock does that for us. - for (u32 line = 0; line < regs.y_count; ++line) { - const GPUVAddr source_line = source + line * regs.src_pitch; - const GPUVAddr dest_line = dest + line * regs.dst_pitch; - memory_manager.CopyBlock(dest_line, source_line, regs.x_count); + if (!is_src_pitch && is_dst_pitch) { + CopyBlockLinearToPitch(); + } else { + CopyPitchToBlockLinear(); } + } +} + +void MaxwellDMA::CopyPitchToPitch() { + // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D + // buffer of length `line_length_in`. + // Otherwise we copy a 2D image of dimensions (line_length_in, line_count). + if (!regs.launch_dma.multi_line_enable) { + memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in); return; } - ASSERT(regs.exec.enable_2d == 1); + // Perform a line-by-line copy. + // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle. + // There is no need to manually flush/invalidate the regions because CopyBlock does that for us. + for (u32 line = 0; line < regs.line_count; ++line) { + const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; + const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; + memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); + } +} - if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { - ASSERT(regs.src_params.BlockDepth() == 0); - // If the input is tiled and the output is linear, deswizzle the input and copy it over. - const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count; - const std::size_t src_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, - regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); +void MaxwellDMA::CopyBlockLinearToPitch() { + UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); + UNIMPLEMENTED_IF(regs.src_params.layer != 0); - const std::size_t src_layer_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1, - regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); + // Optimized path for micro copies. + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; + if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X) { + FastCopyBlockLinearToPitch(); + return; + } - const std::size_t dst_size = regs.dst_pitch * regs.y_count; + // Deswizzle the input and copy it over. 
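
Launch then selects one of four copy routines from the source and destination memory layouts, and only the tiled-to-tiled combination remains unimplemented. A condensed sketch of that dispatch with the copy bodies stubbed out:

#include <cstdio>

enum class MemoryLayout { BLOCKLINEAR, PITCH };

void CopyPitchToPitch()       { std::puts("pitch -> pitch"); }
void CopyBlockLinearToPitch() { std::puts("block linear -> pitch"); }
void CopyPitchToBlockLinear() { std::puts("pitch -> block linear"); }

// Mirrors the layout dispatch in MaxwellDMA::Launch (copy bodies stubbed out here).
void Dispatch(MemoryLayout src, MemoryLayout dst) {
    const bool is_src_pitch = src == MemoryLayout::PITCH;
    const bool is_dst_pitch = dst == MemoryLayout::PITCH;
    if (!is_src_pitch && !is_dst_pitch) {
        std::puts("tiled -> tiled: not implemented"); // UNREACHABLE_MSG in the real engine.
        return;
    }
    if (is_src_pitch && is_dst_pitch) {
        CopyPitchToPitch();
    } else if (!is_src_pitch && is_dst_pitch) {
        CopyBlockLinearToPitch();
    } else {
        CopyPitchToBlockLinear();
    }
}

int main() {
    Dispatch(MemoryLayout::PITCH, MemoryLayout::PITCH);
    Dispatch(MemoryLayout::BLOCKLINEAR, MemoryLayout::PITCH);
    Dispatch(MemoryLayout::PITCH, MemoryLayout::BLOCKLINEAR);
}
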
+ const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; + const Parameters& src_params = regs.src_params; + const u32 width = src_params.width; + const u32 height = src_params.height; + const u32 depth = src_params.depth; + const u32 block_height = src_params.block_size.height; + const u32 block_depth = src_params.block_size.depth; + const size_t src_size = + CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - if (read_buffer.size() < src_size) { - read_buffer.resize(src_size); - } + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - if (write_buffer.size() < dst_size) { - write_buffer.resize(dst_size); - } + UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, + block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(), + read_buffer.data()); - memory_manager.ReadBlock(source, read_buffer.data(), src_size); - memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +} - Texture::UnswizzleSubrect( - regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel, - read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(), - regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y); +void MaxwellDMA::CopyPitchToBlockLinear() { + const auto& dst_params = regs.dst_params; + const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; + const u32 width = dst_params.width; + const u32 height = dst_params.height; + const u32 depth = dst_params.depth; + const u32 block_height = dst_params.block_size.height; + const u32 block_depth = dst_params.block_size.depth; + const size_t dst_size = + CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + const size_t dst_layer_size = + CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); + + const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); + if (Settings::IsGPULevelExtreme()) { + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); } else { - ASSERT(regs.dst_params.BlockDepth() == 0); - - const u32 bytes_per_pixel = regs.src_pitch / regs.x_count; - - const std::size_t dst_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, - regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - - const std::size_t dst_layer_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, - regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); + memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); + } - const std::size_t src_size = regs.src_pitch * regs.y_count; + // If the input is linear and the output is tiled, swizzle the input and 
copy it over. + if (regs.dst_params.block_size.depth > 0) { + ASSERT(dst_params.layer == 0); + SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, + bytes_per_pixel, block_height, block_depth, dst_params.origin.x, + dst_params.origin.y, write_buffer.data(), read_buffer.data()); + } else { + SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel, + write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(), + block_height, dst_params.origin.x, dst_params.origin.y); + } - if (read_buffer.size() < src_size) { - read_buffer.resize(src_size); - } + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +} - if (write_buffer.size() < dst_size) { - write_buffer.resize(dst_size); - } +void MaxwellDMA::FastCopyBlockLinearToPitch() { + const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; + const size_t src_size = GOB_SIZE; + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; + u32 pos_x = regs.src_params.origin.x; + u32 pos_y = regs.src_params.origin.y; + const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, + regs.src_params.block_size.height, bytes_per_pixel); + const u32 x_in_gob = 64 / bytes_per_pixel; + pos_x = pos_x % x_in_gob; + pos_y = pos_y % 8; + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - if (Settings::values.use_accurate_gpu_emulation) { - memory_manager.ReadBlock(source, read_buffer.data(), src_size); - memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); - } else { - memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); - } + if (Settings::IsGPULevelExtreme()) { + memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); + } else { + memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); + } - // If the input is linear and the output is tiled, swizzle the input and copy it over. - Texture::SwizzleSubrect( - regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel, - write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(), - regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y); + UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, + bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y, + write_buffer.data(), read_buffer.data()); - memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); - } + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 4f40d1d1f..50f445efc 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -10,6 +10,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" #include "video_core/gpu.h" namespace Core { @@ -23,156 +24,190 @@ class MemoryManager; namespace Tegra::Engines { /** - * This Engine is known as GK104_Copy. 
Documentation can be found in: + * This engine is known as gk104_copy. Documentation can be found in: + * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml */ -class MaxwellDMA final { +class MaxwellDMA final : public EngineInterface { public: - explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); - ~MaxwellDMA() = default; - - /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + struct PackedGPUVAddr { + u32 upper; + u32 lower; + + constexpr operator GPUVAddr() const noexcept { + return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower; + } + }; + + union BlockSize { + BitField<0, 4, u32> width; + BitField<4, 4, u32> height; + BitField<8, 4, u32> depth; + BitField<12, 4, u32> gob_height; + }; + static_assert(sizeof(BlockSize) == 4); + + union Origin { + BitField<0, 16, u32> x; + BitField<16, 16, u32> y; + }; + static_assert(sizeof(Origin) == 4); + + struct Parameters { + BlockSize block_size; + u32 width; + u32 height; + u32 depth; + u32 layer; + Origin origin; + }; + static_assert(sizeof(Parameters) == 24); + + struct Semaphore { + PackedGPUVAddr address; + u32 payload; + }; + static_assert(sizeof(Semaphore) == 12); + + struct RenderEnable { + enum class Mode : u32 { + FALSE = 0, + TRUE = 1, + CONDITIONAL = 2, + RENDER_IF_EQUAL = 3, + RENDER_IF_NOT_EQUAL = 4, + }; - struct Regs { - static constexpr std::size_t NUM_REGS = 0x1D6; + PackedGPUVAddr address; + BitField<0, 3, Mode> mode; + }; + static_assert(sizeof(RenderEnable) == 12); + + enum class PhysModeTarget : u32 { + LOCAL_FB = 0, + COHERENT_SYSMEM = 1, + NONCOHERENT_SYSMEM = 2, + }; + using PhysMode = BitField<0, 2, PhysModeTarget>; + + union LaunchDMA { + enum class DataTransferType : u32 { + NONE = 0, + PIPELINED = 1, + NON_PIPELINED = 2, + }; - struct Parameters { - union { - BitField<0, 4, u32> block_depth; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_width; - }; - u32 size_x; - u32 size_y; - u32 size_z; - u32 pos_z; - union { - BitField<0, 16, u32> pos_x; - BitField<16, 16, u32> pos_y; - }; + enum class SemaphoreType : u32 { + NONE = 0, + RELEASE_ONE_WORD_SEMAPHORE = 1, + RELEASE_FOUR_WORD_SEMAPHORE = 2, + }; - u32 BlockHeight() const { - return block_height.Value(); - } + enum class InterruptType : u32 { + NONE = 0, + BLOCKING = 1, + NON_BLOCKING = 2, + }; - u32 BlockDepth() const { - return block_depth.Value(); - } + enum class MemoryLayout : u32 { + BLOCKLINEAR = 0, + PITCH = 1, }; - static_assert(sizeof(Parameters) == 24, "Parameters has wrong size"); + enum class Type : u32 { + VIRTUAL = 0, + PHYSICAL = 1, + }; - enum class ComponentMode : u32 { - Src0 = 0, - Src1 = 1, - Src2 = 2, - Src3 = 3, - Const0 = 4, - Const1 = 5, - Zero = 6, + enum class SemaphoreReduction : u32 { + IMIN = 0, + IMAX = 1, + IXOR = 2, + IAND = 3, + IOR = 4, + IADD = 5, + INC = 6, + DEC = 7, + FADD = 0xA, }; - enum class CopyMode : u32 { - None = 0, - Unk1 = 1, - Unk2 = 2, + enum class SemaphoreReductionSign : u32 { + SIGNED = 0, + UNSIGNED = 1, }; - enum class QueryMode : u32 { - None = 0, - Short = 1, - Long = 2, + enum class BypassL2 : u32 { + USE_PTE_SETTING = 0, + FORCE_VOLATILE = 1, }; - enum class QueryIntr : u32 { - None = 0, - Block = 1, - NonBlock = 2, + BitField<0, 2, DataTransferType> data_transfer_type; + BitField<2, 1, u32> flush_enable; + BitField<3, 2, SemaphoreType> semaphore_type; + BitField<5, 2, InterruptType> interrupt_type; + 
BitField<7, 1, MemoryLayout> src_memory_layout; + BitField<8, 1, MemoryLayout> dst_memory_layout; + BitField<9, 1, u32> multi_line_enable; + BitField<10, 1, u32> remap_enable; + BitField<11, 1, u32> rmwdisable; + BitField<12, 1, Type> src_type; + BitField<13, 1, Type> dst_type; + BitField<14, 4, SemaphoreReduction> semaphore_reduction; + BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign; + BitField<19, 1, u32> reduction_enable; + BitField<20, 1, BypassL2> bypass_l2; + }; + static_assert(sizeof(LaunchDMA) == 4); + + struct RemapConst { + enum Swizzle : u32 { + SRC_X = 0, + SRC_Y = 1, + SRC_Z = 2, + SRC_W = 3, + CONST_A = 4, + CONST_B = 5, + NO_WRITE = 6, }; + PackedGPUVAddr address; + union { - struct { - INSERT_UNION_PADDING_WORDS(0xC0); - - struct { - union { - BitField<0, 2, CopyMode> copy_mode; - BitField<2, 1, u32> flush; - - BitField<3, 2, QueryMode> query_mode; - BitField<5, 2, QueryIntr> query_intr; - - BitField<7, 1, u32> is_src_linear; - BitField<8, 1, u32> is_dst_linear; - - BitField<9, 1, u32> enable_2d; - BitField<10, 1, u32> enable_swizzle; - }; - } exec; - - INSERT_UNION_PADDING_WORDS(0x3F); - - struct { - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } src_address; - - struct { - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } dst_address; - - u32 src_pitch; - u32 dst_pitch; - u32 x_count; - u32 y_count; - - INSERT_UNION_PADDING_WORDS(0xB8); - - u32 const0; - u32 const1; - union { - BitField<0, 4, ComponentMode> component0; - BitField<4, 4, ComponentMode> component1; - BitField<8, 4, ComponentMode> component2; - BitField<12, 4, ComponentMode> component3; - BitField<16, 2, u32> component_size; - BitField<20, 3, u32> src_num_components; - BitField<24, 3, u32> dst_num_components; - - u32 SrcBytePerPixel() const { - return src_num_components.Value() * component_size.Value(); - } - u32 DstBytePerPixel() const { - return dst_num_components.Value() * component_size.Value(); - } - } swizzle_config; + BitField<0, 3, Swizzle> dst_x; + BitField<4, 3, Swizzle> dst_y; + BitField<8, 3, Swizzle> dst_z; + BitField<12, 3, Swizzle> dst_w; + BitField<16, 2, u32> component_size_minus_one; + BitField<20, 2, u32> num_src_components_minus_one; + BitField<24, 2, u32> num_dst_components_minus_one; + }; + }; + static_assert(sizeof(RemapConst) == 12); - Parameters dst_params; + explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); + ~MaxwellDMA() = default; - INSERT_UNION_PADDING_WORDS(1); + /// Write the value to the register identified by method. + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; - Parameters src_params; - - INSERT_UNION_PADDING_WORDS(0x13); - }; - std::array<u32, NUM_REGS> reg_array; - }; - } regs{}; + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; private: + /// Performs the copy from the source buffer to the destination buffer as configured in the + /// registers. 
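
PackedGPUVAddr above splits a GPU virtual address across two 32-bit registers; the conversion keeps the low 8 bits of the upper word and splices them above the lower word, yielding a 40-bit address. A small worked example of that conversion, reusing the same operator in a standalone translation unit:

#include <cstdint>
#include <cstdio>

using GPUVAddr = std::uint64_t;

struct PackedGPUVAddr {
    std::uint32_t upper;
    std::uint32_t lower;

    // Same combination as the engine's conversion operator: 8 upper bits + 32 lower bits.
    constexpr operator GPUVAddr() const noexcept {
        return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower;
    }
};

int main() {
    constexpr PackedGPUVAddr addr{0x12, 0x3456'7890};
    constexpr GPUVAddr flat = addr; // 0x12'3456'7890
    static_assert(flat == 0x12'3456'7890ULL);
    std::printf("0x%llx\n", static_cast<unsigned long long>(flat));
}
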
+ void Launch(); + + void CopyPitchToPitch(); + + void CopyBlockLinearToPitch(); + + void CopyPitchToBlockLinear(); + + void FastCopyBlockLinearToPitch(); + Core::System& system; MemoryManager& memory_manager; @@ -180,28 +215,58 @@ private: std::vector<u8> read_buffer; std::vector<u8> write_buffer; - /// Performs the copy from the source buffer to the destination buffer as configured in the - /// registers. - void HandleCopy(); -}; + static constexpr std::size_t NUM_REGS = 0x800; + struct Regs { + union { + struct { + u32 reserved[0x40]; + u32 nop; + u32 reserved01[0xf]; + u32 pm_trigger; + u32 reserved02[0x3f]; + Semaphore semaphore; + u32 reserved03[0x2]; + RenderEnable render_enable; + PhysMode src_phys_mode; + PhysMode dst_phys_mode; + u32 reserved04[0x26]; + LaunchDMA launch_dma; + u32 reserved05[0x3f]; + PackedGPUVAddr offset_in; + PackedGPUVAddr offset_out; + u32 pitch_in; + u32 pitch_out; + u32 line_length_in; + u32 line_count; + u32 reserved06[0xb8]; + RemapConst remap_const; + Parameters dst_params; + u32 reserved07[0x1]; + Parameters src_params; + u32 reserved08[0x275]; + u32 pm_trigger_end; + u32 reserved09[0x3ba]; + }; + std::array<u32, NUM_REGS> reg_array; + }; + } regs{}; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(exec, 0xC0); -ASSERT_REG_POSITION(src_address, 0x100); -ASSERT_REG_POSITION(dst_address, 0x102); -ASSERT_REG_POSITION(src_pitch, 0x104); -ASSERT_REG_POSITION(dst_pitch, 0x105); -ASSERT_REG_POSITION(x_count, 0x106); -ASSERT_REG_POSITION(y_count, 0x107); -ASSERT_REG_POSITION(const0, 0x1C0); -ASSERT_REG_POSITION(const1, 0x1C1); -ASSERT_REG_POSITION(swizzle_config, 0x1C2); -ASSERT_REG_POSITION(dst_params, 0x1C3); -ASSERT_REG_POSITION(src_params, 0x1CA); + ASSERT_REG_POSITION(launch_dma, 0xC0); + ASSERT_REG_POSITION(offset_in, 0x100); + ASSERT_REG_POSITION(offset_out, 0x102); + ASSERT_REG_POSITION(pitch_in, 0x104); + ASSERT_REG_POSITION(pitch_out, 0x105); + ASSERT_REG_POSITION(line_length_in, 0x106); + ASSERT_REG_POSITION(line_count, 0x107); + ASSERT_REG_POSITION(remap_const, 0x1C0); + ASSERT_REG_POSITION(dst_params, 0x1C3); + ASSERT_REG_POSITION(src_params, 0x1CA); #undef ASSERT_REG_POSITION +}; } // namespace Tegra::Engines diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 5e9cfba22..37d17efdc 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -32,31 +32,31 @@ struct Register { constexpr Register() = default; - constexpr Register(u64 value) : value(value) {} + constexpr Register(u64 value_) : value(value_) {} - constexpr operator u64() const { + [[nodiscard]] constexpr operator u64() const { return value; } template <typename T> - constexpr u64 operator-(const T& oth) const { + [[nodiscard]] constexpr u64 operator-(const T& oth) const { return value - oth; } template <typename T> - constexpr u64 operator&(const T& oth) const { + [[nodiscard]] constexpr u64 operator&(const T& oth) const { return value & oth; } - constexpr u64 operator&(const Register& oth) const { + [[nodiscard]] constexpr u64 operator&(const Register& oth) const { return value & oth.value; } - constexpr u64 operator~() const { + [[nodiscard]] constexpr u64 operator~() const { return ~value; } - u64 GetSwizzledIndex(u64 elem) const { + [[nodiscard]] u64 GetSwizzledIndex(u64 elem) const { elem = (value + elem) & 3; return (value & ~3) + elem; } @@ -75,7 
+75,7 @@ enum class AttributeSize : u64 { union Attribute { Attribute() = default; - constexpr explicit Attribute(u64 value) : value(value) {} + constexpr explicit Attribute(u64 value_) : value(value_) {} enum class Index : u64 { LayerViewportPointSize = 6, @@ -107,7 +107,7 @@ union Attribute { BitField<31, 1, u64> patch; BitField<47, 3, AttributeSize> size; - bool IsPhysical() const { + [[nodiscard]] bool IsPhysical() const { return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0; } } fmt20; @@ -124,7 +124,7 @@ union Attribute { union Sampler { Sampler() = default; - constexpr explicit Sampler(u64 value) : value(value) {} + constexpr explicit Sampler(u64 value_) : value(value_) {} enum class Index : u64 { Sampler_0 = 8, @@ -137,7 +137,7 @@ union Sampler { union Image { Image() = default; - constexpr explicit Image(u64 value) : value{value} {} + constexpr explicit Image(u64 value_) : value{value_} {} BitField<36, 13, u64> index; u64 value; @@ -168,18 +168,22 @@ enum class Pred : u64 { }; enum class PredCondition : u64 { - LessThan = 1, - Equal = 2, - LessEqual = 3, - GreaterThan = 4, - NotEqual = 5, - GreaterEqual = 6, - LessThanWithNan = 9, - LessEqualWithNan = 11, - GreaterThanWithNan = 12, - NotEqualWithNan = 13, - GreaterEqualWithNan = 14, - // TODO(Subv): Other condition types + F = 0, // Always false + LT = 1, // Ordered less than + EQ = 2, // Ordered equal + LE = 3, // Ordered less than or equal + GT = 4, // Ordered greater than + NE = 5, // Ordered not equal + GE = 6, // Ordered greater than or equal + NUM = 7, // Ordered + NAN_ = 8, // Unordered + LTU = 9, // Unordered less than + EQU = 10, // Unordered equal + LEU = 11, // Unordered less than or equal + GTU = 12, // Unordered greater than + NEU = 13, // Unordered not equal + GEU = 14, // Unordered greater than or equal + T = 15, // Always true }; enum class PredOperation : u64 { @@ -501,14 +505,14 @@ struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; - bool operator==(const IpaMode& a) const { + [[nodiscard]] bool operator==(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) == std::tie(a.interpolation_mode, a.sampling_mode); } - bool operator!=(const IpaMode& a) const { + [[nodiscard]] bool operator!=(const IpaMode& a) const { return !operator==(a); } - bool operator<(const IpaMode& a) const { + [[nodiscard]] bool operator<(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) < std::tie(a.interpolation_mode, a.sampling_mode); } @@ -654,7 +658,12 @@ union Instruction { return *this; } - constexpr Instruction(u64 value) : value{value} {} + constexpr Instruction(u64 value_) : value{value_} {} + constexpr Instruction(const Instruction& instr) : value(instr.value) {} + + [[nodiscard]] constexpr bool Bit(u64 offset) const { + return ((value >> offset) & 1) != 0; + } BitField<0, 8, Register> gpr0; BitField<8, 8, Register> gpr8; @@ -737,34 +746,34 @@ union Instruction { BitField<28, 8, u64> imm_lut28; BitField<48, 8, u64> imm_lut48; - u32 GetImmLut28() const { + [[nodiscard]] u32 GetImmLut28() const { return static_cast<u32>(imm_lut28); } - u32 GetImmLut48() const { + [[nodiscard]] u32 GetImmLut48() const { return static_cast<u32>(imm_lut48); } } lop3; - u16 GetImm20_16() const { + [[nodiscard]] u16 GetImm20_16() const { return static_cast<u16>(imm20_16); } - u32 GetImm20_19() const { + [[nodiscard]] u32 GetImm20_19() const { u32 imm{static_cast<u32>(imm20_19)}; imm <<= 12; imm |= negate_imm ? 
0x80000000 : 0; return imm; } - u32 GetImm20_32() const { + [[nodiscard]] u32 GetImm20_32() const { return static_cast<u32>(imm20_32); } - s32 GetSignedImm20_20() const { - u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); + [[nodiscard]] s32 GetSignedImm20_20() const { + const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); // Sign extend the 20-bit value. - u32 mask = 1U << (20 - 1); + const auto mask = 1U << (20 - 1); return static_cast<s32>((immediate ^ mask) - mask); } } alu; @@ -813,15 +822,17 @@ union Instruction { } alu_integer; union { + BitField<43, 1, u64> x; + } iadd; + + union { BitField<39, 1, u64> ftz; BitField<32, 1, u64> saturate; BitField<49, 2, HalfMerge> merge; - BitField<43, 1, u64> negate_a; BitField<44, 1, u64> abs_a; BitField<47, 2, HalfType> type_a; - BitField<31, 1, u64> negate_b; BitField<30, 1, u64> abs_b; BitField<28, 2, HalfType> type_b; @@ -846,7 +857,7 @@ union Instruction { BitField<56, 1, u64> second_negate; BitField<30, 9, u64> second; - u32 PackImmediates() const { + [[nodiscard]] u32 PackImmediates() const { // Immediates are half floats shifted. constexpr u32 imm_shift = 6; return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); @@ -1022,7 +1033,7 @@ union Instruction { BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; - s32 GetImmediateOffset() const { + [[nodiscard]] s32 GetImmediateOffset() const { return static_cast<s32>(offset << 2); } } atoms; @@ -1204,7 +1215,7 @@ union Instruction { BitField<39, 4, u64> rounding; // H0, H1 extract for F16 missing BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value - F2fRoundingOp GetRoundingMode() const { + [[nodiscard]] F2fRoundingOp GetRoundingMode() const { constexpr u64 rounding_mask = 0x0B; return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); } @@ -1228,15 +1239,15 @@ union Instruction { BitField<54, 1, u64> aoffi_flag; BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1260,15 +1271,15 @@ union Instruction { BitField<36, 1, u64> aoffi_flag; BitField<37, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1288,7 +1299,7 @@ union Instruction { BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NODEP: return nodep_flag != 0; @@ -1298,7 +1309,7 @@ union Instruction { return false; } - bool 
IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } } txq; @@ -1310,11 +1321,11 @@ union Instruction { BitField<35, 1, u64> ndv_flag; BitField<49, 1, u64> nodep_flag; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return (ndv_flag != 0); @@ -1336,7 +1347,7 @@ union Instruction { BitField<54, 2, u64> offset_mode; BitField<56, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1362,7 +1373,7 @@ union Instruction { BitField<33, 2, u64> offset_mode; BitField<37, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1388,7 +1399,7 @@ union Instruction { BitField<52, 2, u64> component; BitField<55, 1, u64> fp16_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1411,16 +1422,20 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. - if (texture_info == 0) + if (texture_info == 0) { return TextureType::Texture1D; - if (texture_info >= 1 && texture_info <= 9) + } + if (texture_info >= 1 && texture_info <= 9) { return TextureType::Texture2D; - if (texture_info >= 10 && texture_info <= 11) + } + if (texture_info >= 10 && texture_info <= 11) { return TextureType::Texture3D; - if (texture_info >= 12 && texture_info <= 13) + } + if (texture_info >= 12 && texture_info <= 13) { return TextureType::TextureCube; + } LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", static_cast<u32>(texture_info.Value())); @@ -1428,7 +1443,7 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { switch (texture_info) { case 0: case 2: @@ -1447,7 +1462,7 @@ union Instruction { return TextureProcessMode::None; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; @@ -1459,16 +1474,16 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. 
return texture_info >= 7 && texture_info <= 9; } - bool HasTwoDestinations() const { + [[nodiscard]] bool HasTwoDestinations() const { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -1495,7 +1510,7 @@ union Instruction { BitField<54, 1, u64> cl; BitField<55, 1, u64> process_mode; - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL; } } tld; @@ -1505,9 +1520,9 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. - if (texture_info >= 0 && texture_info <= 1) { + if (texture_info <= 1) { return TextureType::Texture1D; } if (texture_info == 2 || texture_info == 8 || texture_info == 12 || @@ -1524,13 +1539,14 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { - if (texture_info == 1 || texture_info == 5 || texture_info == 12) + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) { return TextureProcessMode::LL; + } return TextureProcessMode::LZ; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return texture_info == 12 || texture_info == 4; @@ -1544,7 +1560,7 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info == 8; } @@ -1556,7 +1572,7 @@ union Instruction { BitField<35, 1, u64> aoffi_flag; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return aoffi_flag != 0; @@ -1580,7 +1596,7 @@ union Instruction { BitField<20, 3, StoreType> store_data_layout; BitField<20, 4, u64> component_mask_selector; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { ASSERT(mode == SurfaceDataMode::P); constexpr u8 R = 0b0001; constexpr u8 G = 0b0010; @@ -1593,7 +1609,7 @@ union Instruction { return std::bitset<4>{mask.at(component_mask_selector)}.test(component); } - StoreType GetStoreDataLayout() const { + [[nodiscard]] StoreType GetStoreDataLayout() const { ASSERT(mode == SurfaceDataMode::D_BA); return store_data_layout; } @@ -1611,14 +1627,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchTarget() const { + [[nodiscard]] s32 GetBranchTarget() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. 
- return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } bra; @@ -1626,14 +1643,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchExtend() const { + [[nodiscard]] s32 GetBranchExtend() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } brx; @@ -1686,7 +1704,7 @@ union Instruction { BitField<50, 1, u64> is_op_b_register; BitField<51, 3, VmnmxOperation> operation; - VmnmxType SourceFormatA() const { + [[nodiscard]] VmnmxType SourceFormatA() const { switch (src_format_a) { case 0b11: return VmnmxType::Bits32; @@ -1697,7 +1715,7 @@ union Instruction { } } - VmnmxType SourceFormatB() const { + [[nodiscard]] VmnmxType SourceFormatB() const { switch (src_format_b) { case 0b11: return VmnmxType::Bits32; @@ -1728,7 +1746,7 @@ union Instruction { BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; - u64 GetOffset() const { + [[nodiscard]] u64 GetOffset() const { return shifted_offset * 4; } } cbuf34; @@ -1737,7 +1755,7 @@ union Instruction { BitField<20, 16, s64> offset; BitField<36, 5, u64> index; - s64 GetOffset() const { + [[nodiscard]] s64 GetOffset() const { return offset; } } cbuf36; @@ -1867,7 +1885,9 @@ public: HSETP2_C, HSETP2_R, HSETP2_IMM, + HSET2_C, HSET2_R, + HSET2_IMM, POPC_C, POPC_R, POPC_IMM, @@ -1880,6 +1900,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1983,29 +2004,29 @@ public: /// Returns whether an opcode has an execution predicate field or not (ie, whether it can be /// conditionally executed). - static bool IsPredicatedInstruction(Id opcode) { + [[nodiscard]] static bool IsPredicatedInstruction(Id opcode) { // TODO(Subv): Add the rest of unpredicated instructions. return opcode != Id::SSY && opcode != Id::PBK; } class Matcher { public: - constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type) - : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {} + constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_) + : name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {} - constexpr const char* GetName() const { + [[nodiscard]] constexpr const char* GetName() const { return name; } - constexpr u16 GetMask() const { + [[nodiscard]] constexpr u16 GetMask() const { return mask; } - constexpr Id GetId() const { + [[nodiscard]] constexpr Id GetId() const { return id; } - constexpr Type GetType() const { + [[nodiscard]] constexpr Type GetType() const { return type; } @@ -2014,7 +2035,7 @@ public: * @param instruction The instruction to test * @returns true if the given instruction matches. 
*/ - constexpr bool Matches(u16 instruction) const { + [[nodiscard]] constexpr bool Matches(u16 instruction) const { return (instruction & mask) == expected; } @@ -2026,7 +2047,8 @@ public: Type type; }; - static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { + using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>; + [[nodiscard]] static DecodeResult Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -2048,7 +2070,7 @@ private: * A '0' in a bitstring indicates that a zero must be present at that bit position. * A '1' in a bitstring indicates that a one must be present at that bit position. */ - static constexpr auto GetMaskAndExpect(const char* const bitstring) { + [[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; for (std::size_t i = 0; i < opcode_bitsize; i++) { const std::size_t bit_position = opcode_bitsize - i - 1; @@ -2070,14 +2092,14 @@ private: public: /// Creates a matcher that can match and parse instructions based on bitstring. - static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type, - const char* const name) { + [[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op, + Type type, const char* const name) { const auto [mask, expected] = GetMaskAndExpect(bitstring); return Matcher(name, mask, expected, op, type); } }; - static std::vector<Matcher> GetDecodeTable() { + [[nodiscard]] static std::vector<Matcher> GetDecodeTable() { std::vector<Matcher> table = { #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name) INST("111000110011----", Id::KIL, Type::Flow, "KIL"), @@ -2187,9 +2209,12 @@ private: INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), + INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), + INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index 72e2a33d5..ceec05459 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -41,30 +41,30 @@ struct Header { BitField<26, 1, u32> does_load_or_store; BitField<27, 1, u32> does_fp64; BitField<28, 4, u32> stream_out_mask; - } common0{}; + } common0; union { BitField<0, 24, u32> shader_local_memory_low_size; BitField<24, 8, u32> per_patch_attribute_count; - } common1{}; + } common1; union { BitField<0, 24, u32> shader_local_memory_high_size; BitField<24, 8, u32> threads_per_input_primitive; - } common2{}; + } common2; union { BitField<0, 24, u32> shader_local_memory_crs_size; BitField<24, 4, OutputTopology> output_topology; BitField<28, 4, u32> reserved; - } common3{}; + } common3; union { BitField<0, 12, u32> max_output_vertices; BitField<12, 8, u32> 
store_req_start; // NOTE: not used by geometry shaders. BitField<20, 4, u32> reserved; BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders. - } common4{}; + } common4; union { struct { @@ -145,7 +145,7 @@ struct Header { } } ps; - std::array<u32, 0xF> raw{}; + std::array<u32, 0xF> raw; }; u64 GetLocalMemorySize() const { @@ -153,7 +153,6 @@ struct Header { (common2.shader_local_memory_high_size << 24)); } }; - static_assert(sizeof(Header) == 0x50, "Incorrect structure size"); } // namespace Tegra::Shader diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h new file mode 100644 index 000000000..de6991ef6 --- /dev/null +++ b/src/video_core/fence_manager.h @@ -0,0 +1,164 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <queue> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +class FenceBase { +public: + FenceBase(u32 payload, bool is_stubbed) + : address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {} + + FenceBase(GPUVAddr address, u32 payload, bool is_stubbed) + : address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {} + + GPUVAddr GetAddress() const { + return address; + } + + u32 GetPayload() const { + return payload; + } + + bool IsSemaphore() const { + return is_semaphore; + } + +private: + GPUVAddr address; + u32 payload; + bool is_semaphore; + +protected: + bool is_stubbed; +}; + +template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache> +class FenceManager { +public: + void SignalSemaphore(GPUVAddr addr, u32 value) { + TryReleasePendingFences(); + const bool should_flush = ShouldFlush(); + CommitAsyncFlushes(); + TFence new_fence = CreateFence(addr, value, !should_flush); + fences.push(new_fence); + QueueFence(new_fence); + if (should_flush) { + rasterizer.FlushCommands(); + } + rasterizer.SyncGuestHost(); + } + + void SignalSyncPoint(u32 value) { + TryReleasePendingFences(); + const bool should_flush = ShouldFlush(); + CommitAsyncFlushes(); + TFence new_fence = CreateFence(value, !should_flush); + fences.push(new_fence); + QueueFence(new_fence); + if (should_flush) { + rasterizer.FlushCommands(); + } + rasterizer.SyncGuestHost(); + } + + void WaitPendingFences() { + while (!fences.empty()) { + TFence& current_fence = fences.front(); + if (ShouldWait()) { + WaitFence(current_fence); + } + PopAsyncFlushes(); + if (current_fence->IsSemaphore()) { + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); + } else { + gpu.IncrementSyncPoint(current_fence->GetPayload()); + } + fences.pop(); + } + } + +protected: + explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TQueryCache& query_cache_) + : rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()}, + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} + + virtual ~FenceManager() = default; + + /// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is + /// true + virtual TFence CreateFence(u32 value, bool is_stubbed) = 0; + /// Creates a Semaphore Fence Interface, does not create a backend fence if 
'is_stubbed' is true + virtual TFence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) = 0; + /// Queues a fence into the backend if the fence isn't stubbed. + virtual void QueueFence(TFence& fence) = 0; + /// Notifies that the backend fence has been signaled/reached in host GPU. + virtual bool IsFenceSignaled(TFence& fence) const = 0; + /// Waits until a fence has been signalled by the host GPU. + virtual void WaitFence(TFence& fence) = 0; + + VideoCore::RasterizerInterface& rasterizer; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + TTextureCache& texture_cache; + TTBufferCache& buffer_cache; + TQueryCache& query_cache; + +private: + void TryReleasePendingFences() { + while (!fences.empty()) { + TFence& current_fence = fences.front(); + if (ShouldWait() && !IsFenceSignaled(current_fence)) { + return; + } + PopAsyncFlushes(); + if (current_fence->IsSemaphore()) { + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); + } else { + gpu.IncrementSyncPoint(current_fence->GetPayload()); + } + fences.pop(); + } + } + + bool ShouldWait() const { + return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || + query_cache.ShouldWaitAsyncFlushes(); + } + + bool ShouldFlush() const { + return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || + query_cache.HasUncommittedFlushes(); + } + + void PopAsyncFlushes() { + texture_cache.PopAsyncFlushes(); + buffer_cache.PopAsyncFlushes(); + query_cache.PopAsyncFlushes(); + } + + void CommitAsyncFlushes() { + texture_cache.CommitAsyncFlushes(); + buffer_cache.CommitAsyncFlushes(); + query_cache.CommitAsyncFlushes(); + } + + std::queue<TFence> fences; +}; + +} // namespace VideoCommon diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 8acf2eda2..ebd149c3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
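To make the contract of the new VideoCommon::FenceManager above concrete: a backend derives from the template, picks a pointer-like fence type, and supplies the five pure-virtual hooks (both CreateFence overloads, QueueFence, IsFenceSignaled, WaitFence). The sketch below is a minimal, hypothetical backend; ExampleFence, ExampleFenceManager and the TextureCache/BufferCache/QueryCache parameters are illustrative assumptions only, not the gl_fence_manager/vk_fence_manager code this change actually adds.

#include <memory>

#include "video_core/fence_manager.h"

// Hypothetical fence type; a real one would also hold a host sync handle (GLsync, VkFence, ...).
class ExampleFence : public VideoCommon::FenceBase {
public:
    using VideoCommon::FenceBase::FenceBase; // reuse the syncpoint and semaphore constructors
};
using ExampleFencePtr = std::shared_ptr<ExampleFence>;

// TextureCache, BufferCache and QueryCache stand in for backend caches that provide the
// ShouldWaitAsyncFlushes/HasUncommittedFlushes/CommitAsyncFlushes/PopAsyncFlushes hooks
// the template calls.
using ExampleFenceManagerBase =
    VideoCommon::FenceManager<ExampleFencePtr, TextureCache, BufferCache, QueryCache>;

class ExampleFenceManager final : public ExampleFenceManagerBase {
public:
    explicit ExampleFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
                                 TextureCache& texture_cache, BufferCache& buffer_cache,
                                 QueryCache& query_cache)
        : ExampleFenceManagerBase{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {}

protected:
    ExampleFencePtr CreateFence(u32 value, bool is_stubbed) override {
        return std::make_shared<ExampleFence>(value, is_stubbed);
    }
    ExampleFencePtr CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override {
        return std::make_shared<ExampleFence>(addr, value, is_stubbed);
    }
    void QueueFence(ExampleFencePtr& fence) override {
        // Insert a fence into the host command stream here unless the fence is stubbed.
    }
    bool IsFenceSignaled(ExampleFencePtr& fence) const override {
        return true; // query the host sync handle here
    }
    void WaitFence(ExampleFencePtr& fence) override {
        // Block until the host sync handle signals.
    }
};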
+#include <chrono> + #include "common/assert.h" #include "common/microprofile.h" #include "core/core.h" @@ -9,6 +11,7 @@ #include "core/core_timing_util.h" #include "core/frontend/emu_window.h" #include "core/memory.h" +#include "core/settings.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/kepler_memory.h" @@ -17,26 +20,36 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" +#include "video_core/shader_notify.h" #include "video_core/video_core.h" namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer_, bool is_async) - : system{system}, renderer{std::move(renderer_)}, is_async{is_async} { - auto& rasterizer{renderer->Rasterizer()}; - memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); - dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); - maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); - fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer); - kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); - maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager); - kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); -} +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) + : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, + dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, + cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_}, + maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, + fermi_2d{std::make_unique<Engines::Fermi2D>()}, + kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, + maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, + kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, + shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {} GPU::~GPU() = default; +void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { + renderer = std::move(renderer_); + + VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer(); + memory_manager->BindRasterizer(rasterizer); + maxwell_3d->BindRasterizer(rasterizer); + fermi_2d->BindRasterizer(rasterizer); + kepler_compute->BindRasterizer(rasterizer); +} + Engines::Maxwell3D& GPU::Maxwell3D() { return *maxwell_3d; } @@ -65,10 +78,18 @@ DmaPusher& GPU::DmaPusher() { return *dma_pusher; } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return *cdma_pusher; +} + const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return *cdma_pusher; +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { // Synced GPU, is always in sync if (!is_async) { @@ -76,7 +97,7 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) { } MICROPROFILE_SCOPE(GPU_wait); std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=]() { return syncpoints[syncpoint_id].load() >= value; }); + sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; }); } void GPU::IncrementSyncPoint(const u32 syncpoint_id) { @@ -125,14 +146,38 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { return true; } +u64 GPU::RequestFlush(VAddr addr, 
std::size_t size) { + std::unique_lock lck{flush_request_mutex}; + const u64 fence = ++last_flush_fence; + flush_requests.emplace_back(fence, addr, size); + return fence; +} + +void GPU::TickWork() { + std::unique_lock lck{flush_request_mutex}; + while (!flush_requests.empty()) { + auto& request = flush_requests.front(); + const u64 fence = request.fence; + const VAddr addr = request.addr; + const std::size_t size = request.size; + flush_requests.pop_front(); + flush_request_mutex.unlock(); + renderer->Rasterizer().FlushRegion(addr, size); + current_flush_fence.store(fence); + flush_request_mutex.lock(); + } +} + u64 GPU::GetTicks() const { // This values were reversed engineered by fincs from NVN // The gpu clock is reported in units of 385/625 nanoseconds constexpr u64 gpu_ticks_num = 384; constexpr u64 gpu_ticks_den = 625; - const u64 cpu_ticks = system.CoreTiming().GetTicks(); - const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + if (Settings::values.use_fast_gpu_time.GetValue()) { + nanoseconds /= 256; + } const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; @@ -142,30 +187,13 @@ void GPU::FlushCommands() { renderer->Rasterizer().FlushCommands(); } -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. -enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; +void GPU::SyncGuestHost() { + renderer->Rasterizer().SyncGuestHost(); +} + +void GPU::OnCommandListEnd() { + renderer->Rasterizer().ReleaseFences(); +} enum class GpuSemaphoreOperation { AcquireEqual = 0x1, @@ -180,16 +208,32 @@ void GPU::CallMethod(const MethodCall& method_call) { ASSERT(method_call.subchannel < bound_engines.size()); - if (ExecuteMethodOnEngine(method_call)) { + if (ExecuteMethodOnEngine(method_call.method)) { CallEngineMethod(method_call); } else { CallPullerMethod(method_call); } } -bool GPU::ExecuteMethodOnEngine(const MethodCall& method_call) { - const auto method = static_cast<BufferMethods>(method_call.method); - return method >= BufferMethods::NonPullerMethods; +void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + + ASSERT(subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method)) { + CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); + } else { + for (std::size_t i = 0; i < amount; i++) { + CallPullerMethod( + {method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)}); + } + } +} + +bool GPU::ExecuteMethodOnEngine(u32 method) { + const auto buffer_method = static_cast<BufferMethods>(method); + return buffer_method >= BufferMethods::NonPullerMethods; } void GPU::CallPullerMethod(const MethodCall& 
method_call) { @@ -209,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -250,19 +299,46 @@ void GPU::CallEngineMethod(const MethodCall& method_call) { switch (engine) { case EngineID::FERMI_TWOD_A: - fermi_2d->CallMethod(method_call); + fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } +} + +void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::MAXWELL_B: - maxwell_3d->CallMethod(method_call); + maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMethod(method_call); + kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMethod(method_call); + maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMethod(method_call); + kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); break; default: UNIMPLEMENTED_MSG("Unimplemented engine"); @@ -273,7 +349,46 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { // Bind the current subchannel to the desired engine id. 
LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, method_call.argument); - bound_engines[method_call.subchannel] = static_cast<EngineID>(method_call.argument); + const auto engine_id = static_cast<EngineID>(method_call.argument); + bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); + switch (engine_id) { + case EngineID::FERMI_TWOD_A: + dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_B: + dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); + break; + case EngineID::KEPLER_COMPUTE_B: + dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_DMA_COPY_A: + dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id)); + } +} + +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", + static_cast<u32>(regs.fence_action.op.Value())); + } +} + +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); } void GPU::ProcessSemaphoreTriggerMethod() { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 1a2d747be..21410e125 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,14 +13,15 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h" #include "video_core/dma_pusher.h" using CacheAddr = std::uintptr_t; -inline CacheAddr ToCacheAddr(const void* host_ptr) { +[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { return reinterpret_cast<CacheAddr>(host_ptr); } -inline u8* FromCacheAddr(CacheAddr cache_addr) { +[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { return reinterpret_cast<u8*>(cache_addr); } @@ -33,58 +34,68 @@ class System; namespace VideoCore { class RendererBase; +class ShaderNotify; } // namespace VideoCore namespace Tegra { enum class RenderTargetFormat : u32 { NONE = 0x0, - RGBA32_FLOAT = 0xC0, - RGBA32_UINT = 0xC2, - RGBA16_UNORM = 0xC6, - RGBA16_SNORM = 0xC7, - RGBA16_UINT = 0xC9, - RGBA16_FLOAT = 0xCA, - RG32_FLOAT = 0xCB, - RG32_UINT = 0xCD, - RGBX16_FLOAT = 0xCE, - BGRA8_UNORM = 0xCF, - BGRA8_SRGB = 0xD0, - RGB10_A2_UNORM = 0xD1, - RGBA8_UNORM = 0xD5, - RGBA8_SRGB = 0xD6, - RGBA8_SNORM = 0xD7, - RGBA8_UINT = 0xD9, - RG16_UNORM = 0xDA, - RG16_SNORM = 0xDB, - RG16_SINT = 0xDC, - RG16_UINT = 0xDD, - RG16_FLOAT = 0xDE, - R11G11B10_FLOAT = 0xE0, + R32B32G32A32_FLOAT = 0xC0, + R32G32B32A32_SINT = 0xC1, + R32G32B32A32_UINT = 0xC2, + R16G16B16A16_UNORM = 0xC6, + R16G16B16A16_SNORM = 0xC7, + R16G16B16A16_SINT = 0xC8, + R16G16B16A16_UINT = 0xC9, + R16G16B16A16_FLOAT = 0xCA, + R32G32_FLOAT = 0xCB, + R32G32_SINT = 0xCC, + R32G32_UINT = 0xCD, + R16G16B16X16_FLOAT = 0xCE, + B8G8R8A8_UNORM = 0xCF, + B8G8R8A8_SRGB = 0xD0, + A2B10G10R10_UNORM = 0xD1, + A2B10G10R10_UINT = 0xD2, + A8B8G8R8_UNORM = 0xD5, + A8B8G8R8_SRGB = 0xD6, + A8B8G8R8_SNORM = 0xD7, + A8B8G8R8_SINT = 0xD8, + A8B8G8R8_UINT = 
0xD9, + R16G16_UNORM = 0xDA, + R16G16_SNORM = 0xDB, + R16G16_SINT = 0xDC, + R16G16_UINT = 0xDD, + R16G16_FLOAT = 0xDE, + B10G11R11_FLOAT = 0xE0, R32_SINT = 0xE3, R32_UINT = 0xE4, R32_FLOAT = 0xE5, - B5G6R5_UNORM = 0xE8, - BGR5A1_UNORM = 0xE9, - RG8_UNORM = 0xEA, - RG8_SNORM = 0xEB, + R5G6B5_UNORM = 0xE8, + A1R5G5B5_UNORM = 0xE9, + R8G8_UNORM = 0xEA, + R8G8_SNORM = 0xEB, + R8G8_SINT = 0xEC, + R8G8_UINT = 0xED, R16_UNORM = 0xEE, R16_SNORM = 0xEF, R16_SINT = 0xF0, R16_UINT = 0xF1, R16_FLOAT = 0xF2, R8_UNORM = 0xF3, + R8_SNORM = 0xF4, + R8_SINT = 0xF5, R8_UINT = 0xF6, }; enum class DepthFormat : u32 { - Z32_FLOAT = 0xA, - Z16_UNORM = 0x13, - S8_Z24_UNORM = 0x14, - Z24_X8_UNORM = 0x15, - Z24_S8_UNORM = 0x16, - Z24_C8_UNORM = 0x18, - Z32_S8_X24_FLOAT = 0x19, + D32_FLOAT = 0xA, + D16_UNORM = 0x13, + S8_UINT_Z24_UNORM = 0x14, + D24X8_UNORM = 0x15, + D24S8_UNORM = 0x16, + D24C8_UNORM = 0x18, + D32_FLOAT_S8X24_UINT = 0x19, }; struct CommandListHeader; @@ -95,9 +106,9 @@ class DebugContext; */ struct FramebufferConfig { enum class PixelFormat : u32 { - ABGR8 = 1, - RGB565 = 4, - BGRA8 = 5, + A8B8G8R8_UNORM = 1, + RGB565_UNORM = 4, + B8G8R8A8_UNORM = 5, }; VAddr address; @@ -132,60 +143,102 @@ class MemoryManager; class GPU { public: - explicit GPU(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - bool is_async); - - virtual ~GPU(); - struct MethodCall { u32 method{}; u32 argument{}; u32 subchannel{}; u32 method_count{}; - bool IsLastCall() const { - return method_count <= 1; - } - MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) : method(method), argument(argument), subchannel(subchannel), method_count(method_count) {} + + [[nodiscard]] bool IsLastCall() const { + return method_count <= 1; + } }; + explicit GPU(Core::System& system, bool is_async, bool use_nvdec); + virtual ~GPU(); + + /// Binds a renderer to the GPU. + void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer); + /// Calls a GPU method. void CallMethod(const MethodCall& method_call); + /// Calls a GPU multivalue method. + void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + + /// Flush all current written commands into the host GPU for execution. void FlushCommands(); + /// Synchronizes CPU writes with Host GPU memory. + void SyncGuestHost(); + /// Signal the ending of command list. + virtual void OnCommandListEnd(); + + /// Request a host GPU memory flush from the CPU. + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); + + /// Obtains current flush request fence id. + [[nodiscard]] u64 CurrentFlushRequestFence() const { + return current_flush_fence.load(std::memory_order_relaxed); + } + + /// Tick pending requests within the GPU. + void TickWork(); /// Returns a reference to the Maxwell3D GPU engine. - Engines::Maxwell3D& Maxwell3D(); + [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); /// Returns a const reference to the Maxwell3D GPU engine. - const Engines::Maxwell3D& Maxwell3D() const; + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const; /// Returns a reference to the KeplerCompute GPU engine. - Engines::KeplerCompute& KeplerCompute(); + [[nodiscard]] Engines::KeplerCompute& KeplerCompute(); /// Returns a reference to the KeplerCompute GPU engine. - const Engines::KeplerCompute& KeplerCompute() const; + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const; /// Returns a reference to the GPU memory manager. 
- Tegra::MemoryManager& MemoryManager(); + [[nodiscard]] Tegra::MemoryManager& MemoryManager(); /// Returns a const reference to the GPU memory manager. - const Tegra::MemoryManager& MemoryManager() const; + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const; /// Returns a reference to the GPU DMA pusher. - Tegra::DmaPusher& DmaPusher(); + [[nodiscard]] Tegra::DmaPusher& DmaPusher(); - VideoCore::RendererBase& Renderer() { + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const; + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher(); + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { return *renderer; } - const VideoCore::RendererBase& Renderer() const { + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { return *renderer; } + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { + return *shader_notify; + } + + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { + return *shader_notify; + } + // Waits for the GPU to finish working virtual void WaitIdle() const = 0; @@ -194,27 +247,46 @@ public: void IncrementSyncPoint(u32 syncpoint_id); - u32 GetSyncpointValue(u32 syncpoint_id) const; + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const; void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - u64 GetTicks() const; + [[nodiscard]] u64 GetTicks() const; - std::unique_lock<std::mutex> LockSync() { + [[nodiscard]] std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } - bool IsAsync() const { + [[nodiscard]] bool IsAsync() const { return is_async; } - /// Returns a const reference to the GPU DMA pusher. 
- const Tegra::DmaPusher& DmaPusher() const; + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; struct Regs { - static constexpr size_t NUM_REGS = 0x100; + static constexpr size_t NUM_REGS = 0x40; union { struct { @@ -223,7 +295,7 @@ public: u32 address_high; u32 address_low; - GPUVAddr SemaphoreAddress() const { + [[nodiscard]] GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } @@ -233,7 +305,7 @@ public: u32 semaphore_trigger; INSERT_UNION_PADDING_WORDS(0xC); - // The puser and the puller share the reference counter, the pusher only has read + // The pusher and the puller share the reference counter, the pusher only has read // access u32 reference_count; INSERT_UNION_PADDING_WORDS(0x5); @@ -241,10 +313,7 @@ public: u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; + FenceAction fence_action; INSERT_UNION_PADDING_WORDS(0xE2); // Puller state @@ -263,9 +332,18 @@ public: /// core timing events. virtual void Start() = 0; + /// Obtain the CPU Context + virtual void ObtainContext() = 0; + + /// Release the CPU Context + virtual void ReleaseContext() = 0; + /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; + /// Push GPU command buffer entries to be processed + virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0; + /// Swap buffers (render frame) virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; @@ -283,6 +361,8 @@ protected: private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); @@ -293,17 +373,22 @@ private: /// Calls a GPU engine method. void CallEngineMethod(const MethodCall& method_call); + /// Calls a GPU engine multivalue method. + void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + /// Determines where the method should be executed. 
- bool ExecuteMethodOnEngine(const MethodCall& method_call); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); protected: - std::unique_ptr<Tegra::DmaPusher> dma_pusher; Core::System& system; + std::unique_ptr<Tegra::MemoryManager> memory_manager; + std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<VideoCore::RendererBase> renderer; + const bool use_nvdec; private: - std::unique_ptr<Tegra::MemoryManager> memory_manager; - /// Mapping of command subchannels to their bound engine ids std::array<EngineID, 8> bound_engines = {}; /// 3D engine @@ -316,15 +401,31 @@ private: std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; /// Inline memory engine std::unique_ptr<Engines::KeplerMemory> kepler_memory; + /// Shader build notifier + std::unique_ptr<VideoCore::ShaderNotify> shader_notify; std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::mutex sync_mutex; + std::mutex device_mutex; std::condition_variable sync_cv; + struct FlushRequest { + FlushRequest(u64 fence, VAddr addr, std::size_t size) + : fence{fence}, addr{addr}, size{size} {} + u64 fence; + VAddr addr; + std::size_t size; + }; + + std::list<FlushRequest> flush_requests; + std::atomic<u64> current_flush_fence{}; + u64 last_flush_fence{}; + std::mutex flush_request_mutex; + const bool is_async; }; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 20e73a37e..a9baaf7ef 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -10,23 +10,50 @@ namespace VideoCommon { -GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer_, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context) - : GPU(system, std::move(renderer_), true), gpu_thread{system}, - cpu_context(renderer->GetRenderWindow().CreateSharedContext()), - gpu_context(std::move(context)) {} +GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec) + : GPU{system, true, use_nvdec}, gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; void GPUAsynch::Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); cpu_context->MakeCurrent(); - gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher); +} + +void GPUAsynch::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPUAsynch::ReleaseContext() { + cpu_context->DoneCurrent(); } void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } +void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { gpu_thread.SwapBuffers(framebuffer); } @@ -52,4 +79,8 @@ void GPUAsynch::WaitIdle() const { gpu_thread.WaitIdle(); } +void GPUAsynch::OnCommandListEnd() { + 
gpu_thread.OnCommandListEnd(); +} + } // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 03fd0eef0..0c0872e73 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -20,25 +20,28 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU asynchronously class GPUAsynch final : public Tegra::GPU { public: - explicit GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context); + explicit GPUAsynch(Core::System& system, bool use_nvdec); ~GPUAsynch() override; void Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void WaitIdle() const override; + void OnCommandListEnd() override; + protected: void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; private: GPUThread::ThreadManager gpu_thread; std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; - std::unique_ptr<Core::Frontend::GraphicsContext> gpu_context; }; } // namespace VideoCommon diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 6f38a672a..ecf7bbdf3 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -7,14 +7,18 @@ namespace VideoCommon { -GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context) - : GPU(system, std::move(renderer), false), context{std::move(context)} {} +GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {} GPUSynch::~GPUSynch() = default; -void GPUSynch::Start() { - context->MakeCurrent(); +void GPUSynch::Start() {} + +void GPUSynch::ObtainContext() { + renderer->Context().MakeCurrent(); +} + +void GPUSynch::ReleaseContext() { + renderer->Context().DoneCurrent(); } void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { @@ -22,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->DispatchCalls(); } +void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clears all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { renderer->SwapBuffers(framebuffer); } diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 4a6e9a01d..9d778c71a 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -19,12 +19,14 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU synchronously class GPUSynch final : public Tegra::GPU { public: - explicit GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context); + explicit GPUSynch(Core::System& system, bool use_nvdec); ~GPUSynch() override; void 
Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; @@ -34,9 +36,6 @@ public: protected: void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, [[maybe_unused]] u32 value) const override {} - -private: - std::unique_ptr<Core::Frontend::GraphicsContext> context; }; } // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 10cda686b..4b8f58283 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,8 +4,10 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" +#include "core/settings.h" #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" @@ -14,9 +16,14 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread -static void RunThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher, SynchState& state) { - MicroProfileOnThreadCreate("GpuThread"); +static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, + Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, + SynchState& state, Tegra::CDmaPusher& cdma_pusher) { + std::string name = "yuzu:GPU"; + MicroProfileOnThreadCreate(name.c_str()); + Common::SetCurrentThreadName(name.c_str()); + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + system.RegisterHostThread(); // Wait for first GPU command before acquiring the window context while (state.queue.Empty()) @@ -35,12 +42,20 @@ static void RunThread(VideoCore::RendererBase& renderer, Core::Frontend::Graphic if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); + } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { + // NVDEC + cdma_pusher.Push(std::move(command_list->entries)); + cdma_pusher.DispatchCalls(); } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { renderer.SwapBuffers(data->framebuffer ? 
&*data->framebuffer : nullptr); + } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { + renderer.Rasterizer().ReleaseFences(); + } else if (std::holds_alternative<GPUTickCommand>(next.data)) { + system.GPU().TickWork(); } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { renderer.Rasterizer().FlushRegion(data->addr, data->size); } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { - renderer.Rasterizer().InvalidateRegion(data->addr, data->size); + renderer.Rasterizer().OnCPUWrite(data->addr, data->size); } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { return; } else { @@ -64,30 +79,47 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher) { - thread = std::thread{RunThread, std::ref(renderer), std::ref(context), std::ref(dma_pusher), - std::ref(state)}; + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { PushCommand(SubmitListCommand(std::move(entries))); } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { + PushCommand(SubmitChCommandEntries(std::move(entries))); +} + void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); } void ThreadManager::FlushRegion(VAddr addr, u64 size) { - PushCommand(FlushRegionCommand(addr, size)); + if (!Settings::IsGPULevelHigh()) { + PushCommand(FlushRegionCommand(addr, size)); + return; + } + if (!Settings::IsGPULevelExtreme()) { + return; + } + if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) { + auto& gpu = system.GPU(); + u64 fence = gpu.RequestFlush(addr, size); + PushCommand(GPUTickCommand()); + while (fence > gpu.CurrentFlushRequestFence()) { + } + } } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - system.Renderer().Rasterizer().InvalidateRegion(addr, size); + system.Renderer().Rasterizer().OnCPUWrite(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - InvalidateRegion(addr, size); + system.Renderer().Rasterizer().OnCPUWrite(addr, size); } void ThreadManager::WaitIdle() const { @@ -95,6 +127,10 @@ void ThreadManager::WaitIdle() const { } } +void ThreadManager::OnCommandListEnd() { + PushCommand(OnCommandListEndCommand()); +} + u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index cd74ad330..32a34e3a7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -37,6 +37,14 @@ struct SubmitListCommand final { Tegra::CommandList entries; }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { + explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries) + : entries{std::move(entries)} {} + + Tegra::ChCommandHeaderList entries; +}; + /// Command to signal to the GPU thread that a swap buffers is pending struct 
SwapBuffersCommand final { explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) @@ -70,9 +78,16 @@ struct FlushAndInvalidateRegionCommand final { u64 size; }; +/// Command called within the gpu, to schedule actions after a command list end +struct OnCommandListEndCommand final {}; + +/// Command to make the gpu look into pending requests +struct GPUTickCommand final {}; + using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, - InvalidateRegionCommand, FlushAndInvalidateRegionCommand>; + std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, + SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, + FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; struct CommandDataContainer { CommandDataContainer() = default; @@ -102,11 +117,14 @@ public: /// Creates and starts the GPU thread. void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher); + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher); /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); + /// Push GPU CDMA command buffer entries to be processed + void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); @@ -122,6 +140,8 @@ public: // Wait until the gpu thread is idle. void WaitIdle() const; + void OnCommandListEnd(); + private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt new file mode 100644 index 000000000..c157724a9 --- /dev/null +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -0,0 +1,36 @@ +set(SHADER_SOURCES + opengl_present.frag + opengl_present.vert +) + +set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) +set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) + +set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) +set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) + +foreach(FILENAME IN ITEMS ${SHADER_SOURCES}) + string(REPLACE "." "_" SHADER_NAME ${FILENAME}) + set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) + set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE}) +endforeach() + +add_custom_target(host_shaders + DEPENDS + ${SHADER_HEADERS} + SOURCES + ${SHADER_SOURCES} +) diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake new file mode 100644 index 000000000..c0fc49768 --- /dev/null +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -0,0 +1,13 @@ +set(SOURCE_FILE ${CMAKE_ARGV3}) +set(HEADER_FILE ${CMAKE_ARGV4}) +set(INPUT_FILE ${CMAKE_ARGV5}) + +get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) +string(REPLACE "." 
"_" CONTENTS_NAME ${CONTENTS_NAME}) +string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) + +file(READ ${SOURCE_FILE} CONTENTS) + +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) +configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag new file mode 100644 index 000000000..8a4cb024b --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.frag @@ -0,0 +1,10 @@ +#version 430 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; + +layout (binding = 0) uniform sampler2D color_texture; + +void main() { + color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); +} diff --git a/src/video_core/host_shaders/opengl_present.vert b/src/video_core/host_shaders/opengl_present.vert new file mode 100644 index 000000000..2235d31a4 --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.vert @@ -0,0 +1,24 @@ +#version 430 core + +out gl_PerVertex { + vec4 gl_Position; +}; + +layout (location = 0) in vec2 vert_position; +layout (location = 1) in vec2 vert_tex_coord; +layout (location = 0) out vec2 frag_tex_coord; + +// This is a truncated 3x3 matrix for 2D transformations: +// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. +// The third column performs translation. +// The third row could be used for projection, which we don't need in 2D. It hence is assumed to +// implicitly be [0, 0, 1] +layout (location = 0) uniform mat3x2 modelview_matrix; + +void main() { + // Multiply input position by the rotscale part of the matrix and then manually translate by + // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector + // to `vec3(vert_position.xy, 1.0)` + gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); + frag_tex_coord = vert_tex_coord; +} diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in new file mode 100644 index 000000000..ccdb0d2a9 --- /dev/null +++ b/src/video_core/host_shaders/source_shader.h.in @@ -0,0 +1,9 @@ +#pragma once + +#include <string_view> + +namespace HostShaders { + +constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; + +} // namespace HostShaders diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 000000000..cd21a2112 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,91 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <optional> +#include <boost/container_hash/hash.hpp> +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) + : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {} + +MacroEngine::~MacroEngine() = default; + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, + const std::vector<u32>& parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + std::optional<u32> mid_method; + const auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + for (const auto& [method_base, code] : uploaded_macro_code) { + if (method >= method_base && (method - method_base) < code.size()) { + mid_method = method_base; + break; + } + } + if (!mid_method.has_value()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + } + auto& cache_info = macro_cache[method]; + + if (!mid_method.has_value()) { + cache_info.lle_program = Compile(macro_code->second); + cache_info.hash = boost::hash_value(macro_code->second); + } else { + const auto& macro_cached = uploaded_macro_code[mid_method.value()]; + const auto rebased_method = method - mid_method.value(); + auto& code = uploaded_macro_code[method]; + code.resize(macro_cached.size() - rebased_method); + std::memcpy(code.data(), macro_cached.data() + rebased_method, + code.size() * sizeof(u32)); + cache_info.hash = boost::hash_value(code); + cache_info.lle_program = Compile(code); + } + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (hle_program.has_value()) { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program.value()); + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } + } +} + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) { + if (Settings::values.disable_macro_jit) { + return std::make_unique<MacroInterpreter>(maxwell3d); + } +#ifdef ARCHITECTURE_x86_64 + return std::make_unique<MacroJITx64>(maxwell3d); +#else + return std::make_unique<MacroInterpreter>(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 000000000..31ee3440a --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,142 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
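MacroEngine (macro.cpp above) compiles uploaded macro code lazily on first execution, caches the result per method, and transparently substitutes an HLE replacement when the code hash matches a known macro. A hypothetical call-site sketch; the engine reference, macro position and parameter values are placeholders:

// `maxwell3d`, `macro_position` and `uploaded_words` are illustrative placeholders.
std::unique_ptr<Tegra::MacroEngine> macro_engine = Tegra::GetMacroEngine(maxwell3d);

for (const u32 word : uploaded_words) {
    macro_engine->AddCode(macro_position, word); // store raw macro code, compiled on demand
}

const std::vector<u32> parameters{1, 0x10, 0};
macro_engine->Execute(maxwell3d, macro_position, parameters); // compiles and caches on first call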
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { + ALU = 0, + AddImmediate = 1, + ExtractInsert = 2, + ExtractShiftLeftImmediate = 3, + ExtractShiftLeftRegister = 4, + Read = 5, + Unused = 6, // This operation doesn't seem to be a valid encoding. + Branch = 7, +}; + +enum class ALUOperation : u32 { + Add = 0, + AddWithCarry = 1, + Subtract = 2, + SubtractWithBorrow = 3, + // Operations 4-7 don't seem to be valid encodings. + Xor = 8, + Or = 9, + And = 10, + AndNot = 11, + Nand = 12 +}; + +enum class ResultOperation : u32 { + IgnoreAndFetch = 0, + Move = 1, + MoveAndSetMethod = 2, + FetchAndSend = 3, + MoveAndSend = 4, + FetchAndSetMethod = 5, + MoveAndSetMethodFetchAndSend = 6, + MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { + Zero = 0, + NotZero = 1, +}; + +union Opcode { + u32 raw; + BitField<0, 3, Operation> operation; + BitField<4, 3, ResultOperation> result_operation; + BitField<4, 1, BranchCondition> branch_condition; + // If set on a branch, then the branch doesn't have a delay slot. + BitField<5, 1, u32> branch_annul; + BitField<7, 1, u32> is_exit; + BitField<8, 3, u32> dst; + BitField<11, 3, u32> src_a; + BitField<14, 3, u32> src_b; + // The signed immediate overlaps the second source operand and the alu operation. + BitField<14, 18, s32> immediate; + + BitField<17, 5, ALUOperation> alu_operation; + + // Bitfield instructions data + BitField<17, 5, u32> bf_src_bit; + BitField<22, 5, u32> bf_size; + BitField<27, 5, u32> bf_dst_bit; + + u32 GetBitfieldMask() const { + return (1 << bf_size) - 1; + } + + s32 GetBranchTarget() const { + return static_cast<s32>(immediate * sizeof(u32)); + } +}; + +union MethodAddress { + u32 raw; + BitField<0, 12, u32> address; + BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class HLEMacro; + +class CachedMacro { +public: + virtual ~CachedMacro() = default; + /** + * Executes the macro code with the specified input parameters. + * + * @param parameters The parameters of the macro + * @param method The method to execute + */ + virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: + explicit MacroEngine(Engines::Maxwell3D& maxwell3d); + virtual ~MacroEngine(); + + // Store the uploaded macro code to compile them when they're called. 
+ void AddCode(u32 method, u32 data); + + // Compiles the macro if its not in the cache, and executes the compiled macro + void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters); + +protected: + virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0; + +private: + struct CacheInfo { + std::unique_ptr<CachedMacro> lle_program{}; + std::unique_ptr<CachedMacro> hle_program{}; + u64 hash{}; + bool has_hle_program{}; + }; + + std::unordered_map<u32, CacheInfo> macro_cache; + std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; + std::unique_ptr<HLEMacro> hle_macros; +}; + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 000000000..df00b57df --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,109 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +namespace { +// HLE'd functions +void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0x3ffffff)); + maxwell3d.regs.vb_base_instance = parameters[5]; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.vb_element_base = parameters[3]; + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.index_array.first = parameters[4]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(true, true); + } + maxwell3d.regs.index_array.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + maxwell3d.regs.vertex_buffer.first = parameters[3]; + maxwell3d.regs.vertex_buffer.count = parameters[1]; + maxwell3d.regs.vb_base_instance = parameters[4]; + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + maxwell3d.mme_draw.instance_count = count; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(false, true); + } + maxwell3d.regs.vertex_buffer.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.index_array.first = parameters[3]; + maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? 
+ maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.vb_element_base = element_base; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, element_base); + maxwell3d.CallMethodFromMME(0x8e5, base_instance); + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(true, true); + } + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, 0x0); + maxwell3d.CallMethodFromMME(0x8e5, 0x0); + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} +} // Anonymous namespace + +constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ + {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, + {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, + {0x0217920100488FF7, &HLE_0217920100488FF7}, +}}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { + const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(), + [hash](const auto& pair) { return pair.first == hash; }); + if (it == hle_funcs.end()) { + return std::nullopt; + } + return std::make_unique<HLEMacroImpl>(maxwell3d, it->second); +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) + : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { + func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 000000000..37af875a0 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
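The table above keys HLE replacements on the boost hash of a macro's uploaded code words; MacroEngine::Execute computes that hash once per macro and stores it in its cache entry. A small lookup sketch under the same assumptions (`code`, `parameters` and `method` stand in for real macro data):

// Mirrors the lookup path in MacroEngine::Execute above.
const u64 hash = boost::hash_value(code); // `code` is the uploaded macro word vector
Tegra::HLEMacro hle_macros{maxwell3d};
if (auto hle_program = hle_macros.GetHLEProgram(hash)) {
    (*hle_program)->Execute(parameters, method); // run the high-level replacement
} else {
    // No known replacement for this hash: fall back to the interpreted/JIT-compiled macro.
}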
+ +#pragma once + +#include <memory> +#include <optional> +#include <vector> +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters); + +class HLEMacro { +public: + explicit HLEMacro(Engines::Maxwell3D& maxwell3d); + ~HLEMacro(); + + std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); + ~HLEMacroImpl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + +private: + Engines::Maxwell3D& maxwell3d; + HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 42031d80a..bd01fd1f2 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -1,4 +1,4 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -6,109 +6,46 @@ #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro_interpreter.h" MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -namespace { -enum class Operation : u32 { - ALU = 0, - AddImmediate = 1, - ExtractInsert = 2, - ExtractShiftLeftImmediate = 3, - ExtractShiftLeftRegister = 4, - Read = 5, - Unused = 6, // This operation doesn't seem to be a valid encoding. - Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { - Add = 0, - AddWithCarry = 1, - Subtract = 2, - SubtractWithBorrow = 3, - // Operations 4-7 don't seem to be valid encodings. - Xor = 8, - Or = 9, - And = 10, - AndNot = 11, - Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { - IgnoreAndFetch = 0, - Move = 1, - MoveAndSetMethod = 2, - FetchAndSend = 3, - MoveAndSend = 4, - FetchAndSetMethod = 5, - MoveAndSetMethodFetchAndSend = 6, - MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { - Zero = 0, - NotZero = 1, -}; - -union MacroInterpreter::Opcode { - u32 raw; - BitField<0, 3, Operation> operation; - BitField<4, 3, ResultOperation> result_operation; - BitField<4, 1, BranchCondition> branch_condition; - // If set on a branch, then the branch doesn't have a delay slot. - BitField<5, 1, u32> branch_annul; - BitField<7, 1, u32> is_exit; - BitField<8, 3, u32> dst; - BitField<11, 3, u32> src_a; - BitField<14, 3, u32> src_b; - // The signed immediate overlaps the second source operand and the alu operation. 
- BitField<14, 18, s32> immediate; - - BitField<17, 5, ALUOperation> alu_operation; - - // Bitfield instructions data - BitField<17, 5, u32> bf_src_bit; - BitField<22, 5, u32> bf_size; - BitField<27, 5, u32> bf_dst_bit; - - u32 GetBitfieldMask() const { - return (1 << bf_size) - 1; - } +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} - s32 GetBranchTarget() const { - return static_cast<s32>(immediate * sizeof(u32)); - } -}; +std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); +} -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& code) + : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); registers[1] = parameters[0]; + num_parameters = parameters.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; this->parameters = std::make_unique<u32[]>(num_parameters); } - std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); - this->num_parameters = num_parameters; + std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); // Execute the code until we hit an exit condition. bool keep_executing = true; while (keep_executing) { - keep_executing = Step(offset, false); + keep_executing = Step(false); } // Assert the the macro used all the input parameters ASSERT(next_parameter_index == num_parameters); } -void MacroInterpreter::Reset() { +void MacroInterpreterImpl::Reset() { registers = {}; pc = 0; delayed_pc = {}; @@ -120,10 +57,10 @@ void MacroInterpreter::Reset() { carry_flag = false; } -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { +bool MacroInterpreterImpl::Step(bool is_delay_slot) { u32 base_address = pc; - Opcode opcode = GetOpcode(offset); + Macro::Opcode opcode = GetOpcode(); pc += 4; // Update the program counter if we were delayed @@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { } switch (opcode.operation) { - case Operation::ALU: { + case Macro::Operation::ALU: { u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), GetRegister(opcode.src_b)); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::AddImmediate: { + case Macro::Operation::AddImmediate: { ProcessResult(opcode.result_operation, opcode.dst, GetRegister(opcode.src_a) + opcode.immediate); break; } - case Operation::ExtractInsert: { + case Macro::Operation::ExtractInsert: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, dst); break; } - case Operation::ExtractShiftLeftImmediate: { + case Macro::Operation::ExtractShiftLeftImmediate: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::ExtractShiftLeftRegister: { + case Macro::Operation::ExtractShiftLeftRegister: { u32 
dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Read: { + case Macro::Operation::Read: { u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Branch: { + case Macro::Operation::Branch: { ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); u32 value = GetRegister(opcode.src_a); bool taken = EvaluateBranchCondition(opcode.branch_condition, value); @@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { delayed_pc = base_address + opcode.GetBranchTarget(); // Execute one more instruction due to the delay slot. - return Step(offset, true); + return Step(true); } break; } @@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { // cause an exit if it's executed inside a delay slot. if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction - Step(offset, true); + Step(true); return false; } return true; } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { - const auto& macro_memory{maxwell3d.GetMacroMemory()}; - ASSERT((pc % sizeof(u32)) == 0); - ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); - return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { switch (operation) { - case ALUOperation::Add: { + case Macro::ALUOperation::Add: { const u64 result{static_cast<u64>(src_a) + src_b}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::AddWithCarry: { + case Macro::ALUOperation::AddWithCarry: { const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::Subtract: { + case Macro::ALUOperation::Subtract: { const u64 result{static_cast<u64>(src_a) - src_b}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::SubtractWithBorrow: { + case Macro::ALUOperation::SubtractWithBorrow: { const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::Xor: + case Macro::ALUOperation::Xor: return src_a ^ src_b; - case ALUOperation::Or: + case Macro::ALUOperation::Or: return src_a | src_b; - case ALUOperation::And: + case Macro::ALUOperation::And: return src_a & src_b; - case ALUOperation::AndNot: + case Macro::ALUOperation::AndNot: return src_a & ~src_b; - case ALUOperation::Nand: + case Macro::ALUOperation::Nand: return ~(src_a & src_b); default: @@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) } } -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { switch (operation) { - case ResultOperation::IgnoreAndFetch: + case Macro::ResultOperation::IgnoreAndFetch: // Fetch parameter and ignore result. SetRegister(reg, FetchParameter()); break; - case ResultOperation::Move: + case Macro::ResultOperation::Move: // Move result. 
SetRegister(reg, result); break; - case ResultOperation::MoveAndSetMethod: + case Macro::ResultOperation::MoveAndSetMethod: // Move result and use as Method Address. SetRegister(reg, result); SetMethodAddress(result); break; - case ResultOperation::FetchAndSend: + case Macro::ResultOperation::FetchAndSend: // Fetch parameter and send result. SetRegister(reg, FetchParameter()); Send(result); break; - case ResultOperation::MoveAndSend: + case Macro::ResultOperation::MoveAndSend: // Move and send result. SetRegister(reg, result); Send(result); break; - case ResultOperation::FetchAndSetMethod: + case Macro::ResultOperation::FetchAndSetMethod: // Fetch parameter and use result as Method Address. SetRegister(reg, FetchParameter()); SetMethodAddress(result); break; - case ResultOperation::MoveAndSetMethodFetchAndSend: + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: // Move result and use as Method Address, then fetch and send parameter. SetRegister(reg, result); SetMethodAddress(result); Send(FetchParameter()); break; - case ResultOperation::MoveAndSetMethodSend: + case Macro::ResultOperation::MoveAndSetMethodSend: // Move result and use as Method Address, then send bits 12:17 of result. SetRegister(reg, result); SetMethodAddress(result); @@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res } } -u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); + return true; +} + +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; } -u32 MacroInterpreter::GetRegister(u32 register_id) const { +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { return registers.at(register_id); } -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { // Register 0 is hardwired as the zero register. // Ensure no writes to it actually occur. if (register_id == 0) { @@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) { registers.at(register_id) = value; } -void MacroInterpreter::SetMethodAddress(u32 address) { +void MacroInterpreterImpl::SetMethodAddress(u32 address) { method_address.raw = address; } -void MacroInterpreter::Send(u32 value) { - maxwell3d.CallMethodFromMME({method_address.address, value}); +void MacroInterpreterImpl::Send(u32 value) { + maxwell3d.CallMethodFromMME(method_address.address, value); // Increment the method address by the method increment. 
method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); } -u32 MacroInterpreter::Read(u32 method) const { +u32 MacroInterpreterImpl::Read(u32 method) const { return maxwell3d.GetRegisterValue(method); } -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { - switch (cond) { - case BranchCondition::Zero: - return value == 0; - case BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); - return true; +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; } } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 631146d89..90217fc89 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -1,44 +1,37 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once - #include <array> #include <optional> - +#include <vector> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/macro/macro.h" namespace Tegra { namespace Engines { class Maxwell3D; } -class MacroInterpreter final { +class MacroInterpreter final : public MacroEngine { public: explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); - /** - * Executes the macro code with the specified input parameters. - * @param offset Offset to start execution at. - * @param parameters The parameters of the macro. - */ - void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; private: - enum class ALUOperation : u32; - enum class BranchCondition : u32; - enum class ResultOperation : u32; - - union Opcode; + Engines::Maxwell3D& maxwell3d; +}; - union MethodAddress { - u32 raw; - BitField<0, 12, u32> address; - BitField<12, 6, u32> increment; - }; +class MacroInterpreterImpl : public CachedMacro { +public: + MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + void Execute(const std::vector<u32>& parameters, u32 method) override; +private: /// Resets the execution engine state, zeroing registers, etc. void Reset(); @@ -49,20 +42,20 @@ private: * @param is_delay_slot Whether the current step is being executed due to a delay slot in a * previous instruction. */ - bool Step(u32 offset, bool is_delay_slot); + bool Step(bool is_delay_slot); /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); /// Performs the result operation on the input result and stores it in the specified register /// (if necessary). - void ProcessResult(ResultOperation operation, u32 reg, u32 result); + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; /// Reads an opcode at the current program counter location. - Opcode GetOpcode(u32 offset) const; + Macro::Opcode GetOpcode() const; /// Returns the specified register's value. Register 0 is hardcoded to always return 0. 
u32 GetRegister(u32 register_id) const; @@ -89,13 +82,11 @@ private: /// Program counter to execute at after the delay slot is executed. std::optional<u32> delayed_pc; - static constexpr std::size_t NumMacroRegisters = 8; - /// General purpose macro registers. - std::array<u32, NumMacroRegisters> registers = {}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {}; /// Method address to use for the next Send instruction. - MethodAddress method_address = {}; + Macro::MethodAddress method_address = {}; /// Input parameters of the current macro. std::unique_ptr<u32[]> parameters; @@ -105,5 +96,7 @@ private: u32 next_parameter_index = 0; bool carry_flag = false; + const std::vector<u32>& code; }; + } // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..954b87515 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,620 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; +constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp; +constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + STATE, + RESULT, + PARAMETERS, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} + +std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroJITx64Impl>(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) + : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { + Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroJitExecute); + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + program(&state, parameters.data()); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || + opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; + + Xbyak::Reg32 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip || no_zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT); + src_b = Compile_GetRegister(opcode.src_b, eax); + } else { + if (!is_a_zero) { + src_a = 
Compile_GetRegister(opcode.src_a, RESULT); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, eax); + } + } + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", + static_cast<std::size_t>(opcode.alu_operation.Value())); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It's to encode an instruction + // without doing anything. In our case we can just not emit anything. 
+ if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && + opcode.dst == next.dst) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { + shr(src, opcode.bf_src_bit); + } else if (opcode.bf_src_bit == 31) { + xor_(src, src); + } + // Don't bother masking the whole register since we're using a 32 bit register + if (opcode.bf_size != 31 && opcode.bf_size != 0) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + if (mask != 0xffffffff) { + and_(dst, mask); + } + or_(dst, src); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, dst.cvt8()); + if (opcode.bf_size != 0 && opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + + if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + if (opcode.bf_src_bit != 0) { + shr(src, opcode.bf_src_bit); + } + + if (opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } + shl(src, dst.cvt8()); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + + // Equivalent to Engines::Maxwell3D::GetRegisterValue: + if (optimizer.enable_asserts) { + Xbyak::Label pass_range_check; + cmp(RESULT, 
static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS)); + jb(pass_range_check); + int3(); + L(pass_range_check); + } + mov(rax, qword[STATE]); + mov(RESULT, + dword[rax + offsetof(Engines::Maxwell3D, regs) + + offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3, value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + if (opcode.is_exit) { + L(handle_post_exit); + // Execute 1 instruction + mov(BRANCH_HOLDER, end_of_code); + // Jump to next instruction to skip delay slot check + jmp(labels[jump_address], T_NEAR); + } else { + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag, if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + MICROPROFILE_SCOPE(MacroJitCompile); + labels.fill(Xbyak::Label()); + + 
Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, Common::X64::ABI_PARAM2); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); + + // Track get register for zero registers and mark it as no-op + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction, if we detect this we can + // completely skip the entire code path and no emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Enable run-time assertions in JITted code + optimizer.enable_asserts = false; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast<u32>(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode<ProgramType>(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + switch (opcode.operation) { + case Macro::Operation::ALU: + Compile_ALU(opcode); + break; + case Macro::Operation::AddImmediate: + Compile_AddImmediate(opcode); + break; + case Macro::Operation::ExtractInsert: + Compile_ExtractInsert(opcode); + break; + case Macro::Operation::ExtractShiftLeftImmediate: + Compile_ExtractShiftLeftImmediate(opcode); + break; + case Macro::Operation::ExtractShiftLeftRegister: + Compile_ExtractShiftLeftRegister(opcode); + break; + case Macro::Operation::Read: + Compile_Read(opcode); + break; + case Macro::Operation::Branch: + Compile_Branch(opcode); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); + break; + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { + mov(eax, dword[PARAMETERS]); + add(PARAMETERS, sizeof(u32)); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); + } + + return dst; +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { + const auto SetRegister = 
[this](u32 reg, const Xbyak::Reg32& result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg == 0) { + return; + } + mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); + }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..a180e7428 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,98 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
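Both the interpreter and the JIT decode macro instructions through the Macro::Opcode bitfields declared in macro.h above. A hand-assembled example word (illustrative only, not taken from a real game macro) and its decode:

#include "video_core/macro/macro.h"

// 0x00010A11 encodes: operation = AddImmediate, result_operation = Move,
// dst = r2, src_a = r1, immediate = 4 -- i.e. "r2 = r1 + 4".
Tegra::Macro::Opcode op;
op.raw = 0x00010A11;
// op.operation        == Tegra::Macro::Operation::AddImmediate
// op.result_operation == Tegra::Macro::ResultOperation::Move
// op.dst == 2, op.src_a == 1, op.immediate == 4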
+ +#pragma once + +#include <array> +#include <bitset> +#include <xbyak.h> +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: + MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + ~MacroJITx64Impl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + std::bitset<32> PersistentCallerSavedRegs() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*, const u32*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + bool enable_asserts{}; + }; + OptimizerState optimizer{}; + + std::optional<Macro::Opcode> next_opcode{}; + ProgramType program{nullptr}; + + std::array<Xbyak::Label, MAX_CODE_SIZE> labels; + std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + std::optional<u32> delayed_pc; + + const std::vector<u32>& code; + Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index a3389d0d2..6e70bd362 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -4,186 +4,180 @@ #include "common/alignment.h" #include "common/assert.h" -#include "common/logging/log.h" #include "core/core.h" +#include "core/hle/kernel/memory/page_table.h" #include "core/hle/kernel/process.h" -#include "core/hle/kernel/vm_manager.h" #include "core/memory.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" namespace Tegra { -MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : rasterizer{rasterizer}, system{system} { - std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr); - std::fill(page_table.attributes.begin(), page_table.attributes.end(), - Common::PageType::Unmapped); - 
page_table.Resize(address_space_width); +MemoryManager::MemoryManager(Core::System& system_) + : system{system_}, page_table(page_table_size) {} - // Initialize the map with a single free region covering the entire managed space. - VirtualMemoryArea initial_vma; - initial_vma.size = address_space_end; - vma_map.emplace(initial_vma.base, initial_vma); +MemoryManager::~MemoryManager() = default; - UpdatePageTableForVMA(initial_vma); +void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; } -MemoryManager::~MemoryManager() = default; +GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { + u64 remaining_size{size}; + for (u64 offset{}; offset < size; offset += page_size) { + if (remaining_size < page_size) { + SetPageEntry(gpu_addr + offset, page_entry + offset, remaining_size); + } else { + SetPageEntry(gpu_addr + offset, page_entry + offset); + } + remaining_size -= page_size; + } + return gpu_addr; +} -GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; +GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) { + return UpdateRange(gpu_addr, cpu_addr, size); +} - AllocateMemory(gpu_addr, 0, aligned_size); +GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align) { + return Map(cpu_addr, *FindFreeRange(size, align), size); +} - return gpu_addr; +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { + const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); + ASSERT(gpu_addr); + return Map(cpu_addr, *gpu_addr, size); } -GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; +void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { + if (!size) { + return; + } - AllocateMemory(gpu_addr, 0, aligned_size); + // Flush and invalidate through the GPU interface, to be asynchronous if possible. 
+ system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size); - return gpu_addr; + UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); } -GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; - - MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr); - ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::DeviceMapped) - .IsSuccess()); +std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) { + for (u64 offset{}; offset < size; offset += page_size) { + if (!GetPageEntry(gpu_addr + offset).IsUnmapped()) { + return std::nullopt; + } + } - return gpu_addr; + return UpdateRange(gpu_addr, PageEntry::State::Allocated, size); } -GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & page_mask) == 0); +GPUVAddr MemoryManager::Allocate(std::size_t size, std::size_t align) { + return *AllocateFixed(*FindFreeRange(size, align), size); +} - const u64 aligned_size{Common::AlignUp(size, page_size)}; +void MemoryManager::TryLockPage(PageEntry page_entry, std::size_t size) { + if (!page_entry.IsValid()) { + return; + } - MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr); ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::DeviceMapped) + ->PageTable() + .LockForDeviceAddressSpace(page_entry.ToAddress(), size) .IsSuccess()); - return gpu_addr; } -GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & page_mask) == 0); - - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const auto cpu_addr = GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr); - - // Flush and invalidate through the GPU interface, to be asynchronous if possible. - system.GPU().FlushAndInvalidateRegion(*cpu_addr, aligned_size); +void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) { + if (!page_entry.IsValid()) { + return; + } - UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::None) + ->PageTable() + .UnlockForDeviceAddressSpace(page_entry.ToAddress(), size) .IsSuccess()); +} - return gpu_addr; +PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const { + return page_table[PageEntryIndex(gpu_addr)]; } -GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) const { - // Find the first Free VMA. - const VMAHandle vma_handle{ - std::find_if(vma_map.begin(), vma_map.end(), [region_start, size](const auto& vma) { - if (vma.second.type != VirtualMemoryArea::Type::Unmapped) { - return false; - } +void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { + // TODO(bunnei): We should lock/unlock device regions. This currently causes issues due to + // improper tracking, but should be fixed in the future. 
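// Illustrative sketch (not part of this diff) of the page-table math behind
// GetPageEntry/SetPageEntry above, using the constants from memory_manager.h below:
// 64 KiB GPU pages (page_bits = 16), a 2^24-entry flat table, and a PageEntry that
// packs cpu_addr >> 12 into 32 bits, reserving all-ones/all-but-one as sentinels.
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

namespace sketch {
constexpr std::uint64_t page_bits = 16;
constexpr std::uint64_t page_mask = (1ULL << page_bits) - 1;
constexpr std::uint64_t page_table_size = 1ULL << 24;
constexpr std::uint32_t unmapped = 0xFFFFFFFF;   // PageEntry::State::Unmapped
constexpr std::uint32_t allocated = 0xFFFFFFFE;  // PageEntry::State::Allocated

constexpr std::size_t PageEntryIndex(std::uint64_t gpu_addr) {
    return (gpu_addr >> page_bits) & (page_table_size - 1);
}

// SetPageEntry equivalent: map one GPU page by storing cpu_addr >> 12 in the table.
void SetPage(std::vector<std::uint32_t>& table, std::uint64_t gpu_addr, std::uint64_t cpu_addr) {
    table[PageEntryIndex(gpu_addr)] = static_cast<std::uint32_t>(cpu_addr >> 12);
}

// GpuToCpuAddress equivalent: rebuild the CPU address and add the in-page offset.
std::optional<std::uint64_t> GpuToCpu(const std::vector<std::uint32_t>& table,
                                      std::uint64_t gpu_addr) {
    const std::uint32_t entry = table[PageEntryIndex(gpu_addr)];
    if (entry == unmapped || entry == allocated) {
        return std::nullopt;
    }
    return (static_cast<std::uint64_t>(entry) << 12) + (gpu_addr & page_mask);
}
} // namespace sketch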
- const VAddr vma_end{vma.second.base + vma.second.size}; - return vma_end > region_start && vma_end >= region_start + size; - })}; + //// Unlock the old page + // TryUnlockPage(page_table[PageEntryIndex(gpu_addr)], size); - if (vma_handle == vma_map.end()) { - return {}; - } + //// Lock the new page + // TryLockPage(page_entry, size); - return std::max(region_start, vma_handle->second.base); + page_table[PageEntryIndex(gpu_addr)] = page_entry; } -bool MemoryManager::IsAddressValid(GPUVAddr addr) const { - return (addr >> page_bits) < page_table.pointers.size(); -} +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address) const { + if (!align) { + align = page_size; + } else { + align = Common::AlignUp(align, page_size); + } -std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { - return {}; + u64 available_size{}; + GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start}; + while (gpu_addr + available_size < address_space_size) { + if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { + available_size += page_size; + + if (available_size >= size) { + return gpu_addr; + } + } else { + gpu_addr += available_size + page_size; + available_size = 0; + + const auto remainder{gpu_addr % align}; + if (remainder) { + gpu_addr = (gpu_addr - remainder) + align; + } + } } - const VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]}; - if (cpu_addr) { - return cpu_addr + (addr & page_mask); + return std::nullopt; +} + +std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { + const auto page_entry{GetPageEntry(gpu_addr)}; + if (!page_entry.IsValid()) { + return std::nullopt; } - return {}; + return page_entry.ToAddress() + (gpu_addr & page_mask); } template <typename T> T MemoryManager::Read(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { - return {}; - } - - const u8* page_pointer{GetPointer(addr)}; - if (page_pointer) { + if (auto page_pointer{GetPointer(addr)}; page_pointer) { // NOTE: Avoid adding any extra logic to this fast-path block T value; std::memcpy(&value, page_pointer, sizeof(T)); return value; } - switch (page_table.attributes[addr >> page_bits]) { - case Common::PageType::Unmapped: - LOG_ERROR(HW_GPU, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, addr); - return 0; - case Common::PageType::Memory: - ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); + return {}; } template <typename T> void MemoryManager::Write(GPUVAddr addr, T data) { - if (!IsAddressValid(addr)) { - return; - } - - u8* page_pointer{GetPointer(addr)}; - if (page_pointer) { + if (auto page_pointer{GetPointer(addr)}; page_pointer) { // NOTE: Avoid adding any extra logic to this fast-path block std::memcpy(page_pointer, &data, sizeof(T)); return; } - switch (page_table.attributes[addr >> page_bits]) { - case Common::PageType::Unmapped: - LOG_ERROR(HW_GPU, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8, - static_cast<u32>(data), addr); - return; - case Common::PageType::Memory: - ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); } template u8 MemoryManager::Read<u8>(GPUVAddr addr) const; @@ -195,71 +189,48 @@ template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data); template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data); template void 
MemoryManager::Write<u64>(GPUVAddr addr, u64 data); -u8* MemoryManager::GetPointer(GPUVAddr addr) { - if (!IsAddressValid(addr)) { +u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) { + if (!GetPageEntry(gpu_addr).IsValid()) { return {}; } - auto& memory = system.Memory(); - - const VAddr page_addr{page_table.backing_addr[addr >> page_bits]}; - - if (page_addr != 0) { - return memory.GetPointer(page_addr + (addr & page_mask)); + const auto address{GpuToCpuAddress(gpu_addr)}; + if (!address) { + return {}; } - LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); - return {}; + return system.Memory().GetPointer(*address); } -const u8* MemoryManager::GetPointer(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { +const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const { + if (!GetPageEntry(gpu_addr).IsValid()) { return {}; } - const auto& memory = system.Memory(); - - const VAddr page_addr{page_table.backing_addr[addr >> page_bits]}; - - if (page_addr != 0) { - return memory.GetPointer(page_addr + (addr & page_mask)); + const auto address{GpuToCpuAddress(gpu_addr)}; + if (!address) { + return {}; } - LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); - return {}; -} - -bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t size) const { - const std::size_t inner_size = size - 1; - const GPUVAddr end = start + inner_size; - const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start)); - const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end)); - const auto range = static_cast<std::size_t>(host_ptr_end - host_ptr_start); - return range == inner_size; + return system.Memory().GetPointer(*address); } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { +void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - switch (page_table.attributes[page_index]) { - case Common::PageType::Memory: { - const VAddr src_addr{page_table.backing_addr[page_index] + page_offset}; + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto src_addr{*page_addr + page_offset}; + // Flush must happen on the rasterizer interface, such that memory is always synchronous // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu. 
- rasterizer.FlushRegion(src_addr, copy_amount); - memory.ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); - break; - } - default: - UNREACHABLE(); + rasterizer->FlushRegion(src_addr, copy_amount); + system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); } page_index++; @@ -269,24 +240,23 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s } } -void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, +void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - const u8* page_pointer = page_table.pointers[page_index]; - if (page_pointer) { - const VAddr src_addr{page_table.backing_addr[page_index] + page_offset}; - memory.ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); + + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto src_addr{*page_addr + page_offset}; + system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); } else { std::memset(dest_buffer, 0, copy_amount); } + page_index++; page_offset = 0; dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; @@ -294,28 +264,22 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { +void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - switch (page_table.attributes[page_index]) { - case Common::PageType::Memory: { - const VAddr dest_addr{page_table.backing_addr[page_index] + page_offset}; + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto dest_addr{*page_addr + page_offset}; + // Invalidate must happen on the rasterizer interface, such that memory is always // synchronous when it is written (even when in asynchronous GPU mode). 
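// Illustrative sketch (not part of this diff) of the per-page walk shared by
// ReadBlock/WriteBlock above: each iteration copies at most up to the next 64 KiB
// page boundary, translates that page through GpuToCpuAddress, and only the first
// iteration can start at a non-zero offset within a page.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace sketch {
constexpr std::size_t page_bits = 16;
constexpr std::size_t page_size = std::size_t{1} << page_bits;
constexpr std::size_t page_mask = page_size - 1;

// Returns the chunk sizes a block copy of [gpu_addr, gpu_addr + size) would use.
std::vector<std::size_t> CopyAmounts(std::uint64_t gpu_addr, std::size_t size) {
    std::vector<std::size_t> amounts;
    std::size_t remaining_size = size;
    std::size_t page_offset = gpu_addr & page_mask;
    while (remaining_size > 0) {
        const std::size_t copy_amount = std::min(page_size - page_offset, remaining_size);
        amounts.push_back(copy_amount);
        page_offset = 0;              // only the first chunk can be misaligned
        remaining_size -= copy_amount;
    }
    return amounts;
}
// Example: CopyAmounts(0x1'FF00, 0x2'0000) yields {0x100, 0x1'0000, 0xFF00}.
} // namespace sketch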
- rasterizer.InvalidateRegion(dest_addr, copy_amount); - memory.WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); - break; - } - default: - UNREACHABLE(); + rasterizer->InvalidateRegion(dest_addr, copy_amount); + system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); } page_index++; @@ -325,22 +289,21 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const } } -void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, - const std::size_t size) { +void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, + std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - u8* page_pointer = page_table.pointers[page_index]; - if (page_pointer) { - const VAddr dest_addr{page_table.backing_addr[page_index] + page_offset}; - memory.WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); + + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto dest_addr{*page_addr + page_offset}; + system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); } + page_index++; page_offset = 0; src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; @@ -348,270 +311,26 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlock(src_addr, tmp_buffer.data(), size); - WriteBlock(dest_addr, tmp_buffer.data(), size); + ReadBlock(gpu_src_addr, tmp_buffer.data(), size); + WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); - WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); -} - -bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { - const VAddr addr = page_table.backing_addr[gpu_addr >> page_bits]; - const std::size_t page = (addr & Memory::PAGE_MASK) + size; - return page <= Memory::PAGE_SIZE; -} - -void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, - VAddr backing_addr) { - LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, - (base + size) * page_size); - - const VAddr end{base + size}; - ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", - base + page_table.pointers.size()); - - std::fill(page_table.attributes.begin() + base, page_table.attributes.begin() + end, type); - - if (memory == nullptr) { - std::fill(page_table.pointers.begin() + base, page_table.pointers.begin() + end, memory); - std::fill(page_table.backing_addr.begin() + base, page_table.backing_addr.begin() + end, - backing_addr); - } else { - while (base != end) { - page_table.pointers[base] = memory; - page_table.backing_addr[base] = backing_addr; - - 
base += 1; - memory += page_size; - backing_addr += page_size; - } - } -} - -void MemoryManager::MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); - MapPages(base / page_size, size / page_size, target, Common::PageType::Memory, backing_addr); -} - -void MemoryManager::UnmapRegion(GPUVAddr base, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); - MapPages(base / page_size, size / page_size, nullptr, Common::PageType::Unmapped); -} - -bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const { - ASSERT(base + size == next.base); - if (type != next.type) { - return {}; - } - if (type == VirtualMemoryArea::Type::Allocated && (offset + size != next.offset)) { - return {}; - } - if (type == VirtualMemoryArea::Type::Mapped && backing_memory + size != next.backing_memory) { - return {}; - } - return true; -} - -MemoryManager::VMAHandle MemoryManager::FindVMA(GPUVAddr target) const { - if (target >= address_space_end) { - return vma_map.end(); - } else { - return std::prev(vma_map.upper_bound(target)); - } -} - -MemoryManager::VMAIter MemoryManager::Allocate(VMAIter vma_handle) { - VirtualMemoryArea& vma{vma_handle->second}; - - vma.type = VirtualMemoryArea::Type::Allocated; - vma.backing_addr = 0; - vma.backing_memory = {}; - UpdatePageTableForVMA(vma); - - return MergeAdjacent(vma_handle); -} - -MemoryManager::VMAHandle MemoryManager::AllocateMemory(GPUVAddr target, std::size_t offset, - u64 size) { - - // This is the appropriately sized VMA that will turn into our allocation. - VMAIter vma_handle{CarveVMA(target, size)}; - VirtualMemoryArea& vma{vma_handle->second}; - - ASSERT(vma.size == size); - - vma.offset = offset; - - return Allocate(vma_handle); -} - -MemoryManager::VMAHandle MemoryManager::MapBackingMemory(GPUVAddr target, u8* memory, u64 size, - VAddr backing_addr) { - // This is the appropriately sized VMA that will turn into our allocation. - VMAIter vma_handle{CarveVMA(target, size)}; - VirtualMemoryArea& vma{vma_handle->second}; - - ASSERT(vma.size == size); - - vma.type = VirtualMemoryArea::Type::Mapped; - vma.backing_memory = memory; - vma.backing_addr = backing_addr; - UpdatePageTableForVMA(vma); - - return MergeAdjacent(vma_handle); -} - -void MemoryManager::UnmapRange(GPUVAddr target, u64 size) { - VMAIter vma{CarveVMARange(target, size)}; - const VAddr target_end{target + size}; - const VMAIter end{vma_map.end()}; - - // The comparison against the end of the range must be done using addresses since VMAs can be - // merged during this process, causing invalidation of the iterators. - while (vma != end && vma->second.base < target_end) { - // Unmapped ranges return to allocated state and can be reused - // This behavior is used by Super Mario Odyssey, Sonic Forces, and likely other games - vma = std::next(Allocate(vma)); - } - - ASSERT(FindVMA(target)->second.size >= size); -} - -MemoryManager::VMAIter MemoryManager::StripIterConstness(const VMAHandle& iter) { - // This uses a neat C++ trick to convert a const_iterator to a regular iterator, given - // non-const access to its container. 
- return vma_map.erase(iter, iter); // Erases an empty range of elements -} - -MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: 0x{:016X}", base); - - VMAIter vma_handle{StripIterConstness(FindVMA(base))}; - if (vma_handle == vma_map.end()) { - // Target address is outside the managed range - return {}; - } - - const VirtualMemoryArea& vma{vma_handle->second}; - if (vma.type == VirtualMemoryArea::Type::Mapped) { - // Region is already allocated - return vma_handle; - } - - const VAddr start_in_vma{base - vma.base}; - const VAddr end_in_vma{start_in_vma + size}; - - ASSERT_MSG(end_in_vma <= vma.size, "region size 0x{:016X} is less than required size 0x{:016X}", - vma.size, end_in_vma); - - if (end_in_vma < vma.size) { - // Split VMA at the end of the allocated region - SplitVMA(vma_handle, end_in_vma); - } - if (start_in_vma != 0) { - // Split VMA at the start of the allocated region - vma_handle = SplitVMA(vma_handle, start_in_vma); - } - - return vma_handle; -} - -MemoryManager::VMAIter MemoryManager::CarveVMARange(GPUVAddr target, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); - ASSERT_MSG((target & page_mask) == 0, "non-page aligned base: 0x{:016X}", target); - - const VAddr target_end{target + size}; - ASSERT(target_end >= target); - ASSERT(size > 0); - - VMAIter begin_vma{StripIterConstness(FindVMA(target))}; - const VMAIter i_end{vma_map.lower_bound(target_end)}; - if (std::any_of(begin_vma, i_end, [](const auto& entry) { - return entry.second.type == VirtualMemoryArea::Type::Unmapped; - })) { - return {}; - } - - if (target != begin_vma->second.base) { - begin_vma = SplitVMA(begin_vma, target - begin_vma->second.base); - } - - VMAIter end_vma{StripIterConstness(FindVMA(target_end))}; - if (end_vma != vma_map.end() && target_end != end_vma->second.base) { - end_vma = SplitVMA(end_vma, target_end - end_vma->second.base); - } - - return begin_vma; -} - -MemoryManager::VMAIter MemoryManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) { - VirtualMemoryArea& old_vma{vma_handle->second}; - VirtualMemoryArea new_vma{old_vma}; // Make a copy of the VMA - - // For now, don't allow no-op VMA splits (trying to split at a boundary) because it's probably - // a bug. This restriction might be removed later. 
- ASSERT(offset_in_vma < old_vma.size); - ASSERT(offset_in_vma > 0); - - old_vma.size = offset_in_vma; - new_vma.base += offset_in_vma; - new_vma.size -= offset_in_vma; - - switch (new_vma.type) { - case VirtualMemoryArea::Type::Unmapped: - break; - case VirtualMemoryArea::Type::Allocated: - new_vma.offset += offset_in_vma; - break; - case VirtualMemoryArea::Type::Mapped: - new_vma.backing_memory += offset_in_vma; - break; - } - - ASSERT(old_vma.CanBeMergedWith(new_vma)); - - return vma_map.emplace_hint(std::next(vma_handle), new_vma.base, new_vma); -} - -MemoryManager::VMAIter MemoryManager::MergeAdjacent(VMAIter iter) { - const VMAIter next_vma{std::next(iter)}; - if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) { - iter->second.size += next_vma->second.size; - vma_map.erase(next_vma); - } - - if (iter != vma_map.begin()) { - VMAIter prev_vma{std::prev(iter)}; - if (prev_vma->second.CanBeMergedWith(iter->second)) { - prev_vma->second.size += iter->second.size; - vma_map.erase(iter); - iter = prev_vma; - } - } - - return iter; + ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) { - switch (vma.type) { - case VirtualMemoryArea::Type::Unmapped: - UnmapRegion(vma.base, vma.size); - break; - case VirtualMemoryArea::Type::Allocated: - MapMemoryRegion(vma.base, vma.size, nullptr, vma.backing_addr); - break; - case VirtualMemoryArea::Type::Mapped: - MapMemoryRegion(vma.base, vma.size, vma.backing_memory, vma.backing_addr); - break; +bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { + const auto cpu_addr{GpuToCpuAddress(gpu_addr)}; + if (!cpu_addr) { + return false; } + const std::size_t page{(*cpu_addr & Core::Memory::PAGE_MASK) + size}; + return page <= Core::Memory::PAGE_SIZE; } } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 0d9468535..c078193d9 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -6,9 +6,9 @@ #include <map> #include <optional> +#include <vector> #include "common/common_types.h" -#include "common/page_table.h" namespace VideoCore { class RasterizerInterface; @@ -20,58 +20,70 @@ class System; namespace Tegra { -/** - * Represents a VMA in an address space. A VMA is a contiguous region of virtual addressing space - * with homogeneous attributes across its extents. In this particular implementation each VMA is - * also backed by a single host memory allocation. - */ -struct VirtualMemoryArea { - enum class Type : u8 { - Unmapped, - Allocated, - Mapped, +class PageEntry final { +public: + enum class State : u32 { + Unmapped = static_cast<u32>(-1), + Allocated = static_cast<u32>(-2), }; - /// Virtual base address of the region. - GPUVAddr base{}; - /// Size of the region. - u64 size{}; - /// Memory area mapping type. - Type type{Type::Unmapped}; - /// CPU memory mapped address corresponding to this memory area. - VAddr backing_addr{}; - /// Offset into the backing_memory the mapping starts from. - std::size_t offset{}; - /// Pointer backing this VMA. - u8* backing_memory{}; - - /// Tests if this area can be merged to the right with `next`. 
- bool CanBeMergedWith(const VirtualMemoryArea& next) const; + constexpr PageEntry() = default; + constexpr PageEntry(State state) : state{state} {} + constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {} + + [[nodiscard]] constexpr bool IsUnmapped() const { + return state == State::Unmapped; + } + + [[nodiscard]] constexpr bool IsAllocated() const { + return state == State::Allocated; + } + + [[nodiscard]] constexpr bool IsValid() const { + return !IsUnmapped() && !IsAllocated(); + } + + [[nodiscard]] constexpr VAddr ToAddress() const { + if (!IsValid()) { + return {}; + } + + return static_cast<VAddr>(state) << ShiftBits; + } + + [[nodiscard]] constexpr PageEntry operator+(u64 offset) const { + // If this is a reserved value, offsets do not apply + if (!IsValid()) { + return *this; + } + return PageEntry{(static_cast<VAddr>(state) << ShiftBits) + offset}; + } + +private: + static constexpr std::size_t ShiftBits{12}; + + State state{State::Unmapped}; }; +static_assert(sizeof(PageEntry) == 4, "PageEntry is too large"); class MemoryManager final { public: - explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer); + explicit MemoryManager(Core::System& system); ~MemoryManager(); - GPUVAddr AllocateSpace(u64 size, u64 align); - GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align); - GPUVAddr MapBufferEx(VAddr cpu_addr, u64 size); - GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr addr, u64 size); - GPUVAddr UnmapBuffer(GPUVAddr addr, u64 size); - std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; + /// Binds a renderer to the memory manager. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + + [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; template <typename T> - T Read(GPUVAddr addr) const; + [[nodiscard]] T Read(GPUVAddr addr) const; template <typename T> void Write(GPUVAddr addr, T data); - u8* GetPointer(GPUVAddr addr); - const u8* GetPointer(GPUVAddr addr) const; - - /// Returns true if the block is continuous in host memory, false otherwise - bool IsBlockContinuous(GPUVAddr start, std::size_t size) const; + [[nodiscard]] u8* GetPointer(GPUVAddr addr); + [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const; /** * ReadBlock and WriteBlock are full read and write operations over virtual @@ -79,9 +91,9 @@ public: * in the Host Memory counterpart. Note: This functions cause Host GPU Memory * Flushes and Invalidations, respectively to each operation. */ - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and @@ -93,97 +105,51 @@ public: * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture * being flushed. 
*/ - void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); - - /** - * IsGranularRange checks if a gpu region can be simply read with a pointer - */ - bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size); - -private: - using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; - using VMAHandle = VMAMap::const_iterator; - using VMAIter = VMAMap::iterator; - - bool IsAddressValid(GPUVAddr addr) const; - void MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, - VAddr backing_addr = 0); - void MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr); - void UnmapRegion(GPUVAddr base, u64 size); - - /// Finds the VMA in which the given address is included in, or `vma_map.end()`. - VMAHandle FindVMA(GPUVAddr target) const; - - VMAHandle AllocateMemory(GPUVAddr target, std::size_t offset, u64 size); - - /** - * Maps an unmanaged host memory pointer at a given address. - * - * @param target The guest address to start the mapping at. - * @param memory The memory to be mapped. - * @param size Size of the mapping in bytes. - * @param backing_addr The base address of the range to back this mapping. - */ - VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr); - - /// Unmaps a range of addresses, splitting VMAs as necessary. - void UnmapRange(GPUVAddr target, u64 size); - - /// Converts a VMAHandle to a mutable VMAIter. - VMAIter StripIterConstness(const VMAHandle& iter); - - /// Marks as the specified VMA as allocated. - VMAIter Allocate(VMAIter vma); + void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** - * Carves a VMA of a specific size at the specified address by splitting Free VMAs while doing - * the appropriate error checking. + * IsGranularRange checks if a gpu region can be simply read with a pointer. */ - VMAIter CarveVMA(GPUVAddr base, u64 size); + [[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const; - /** - * Splits the edges of the given range of non-Free VMAs so that there is a VMA split at each - * end of the range. - */ - VMAIter CarveVMARange(GPUVAddr base, u64 size); - - /** - * Splits a VMA in two, at the specified offset. - * @returns the right side of the split, with the original iterator becoming the left side. - */ - VMAIter SplitVMA(VMAIter vma, u64 offset_in_vma); - - /** - * Checks for and merges the specified VMA with adjacent ones if possible. - * @returns the merged VMA or the original if no merging was possible. - */ - VMAIter MergeAdjacent(VMAIter vma); - - /// Updates the pages corresponding to this VMA so they match the VMA's attributes. - void UpdatePageTableForVMA(const VirtualMemoryArea& vma); - - /// Finds a free (unmapped region) of the specified size starting at the specified address. 
- GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size) const; + [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); + [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); + void Unmap(GPUVAddr gpu_addr, std::size_t size); private: + [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; + void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); + GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); + [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address = false) const; + + void TryLockPage(PageEntry page_entry, std::size_t size); + void TryUnlockPage(PageEntry page_entry, std::size_t size); + + [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) { + return (gpu_addr >> page_bits) & page_table_mask; + } + + static constexpr u64 address_space_size = 1ULL << 40; + static constexpr u64 address_space_start = 1ULL << 32; + static constexpr u64 address_space_start_low = 1ULL << 16; static constexpr u64 page_bits{16}; static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; + static constexpr u64 page_table_bits{24}; + static constexpr u64 page_table_size{1 << page_table_bits}; + static constexpr u64 page_table_mask{page_table_size - 1}; - /// Address space in bits, according to Tegra X1 TRM - static constexpr u32 address_space_width{40}; - /// Start address for mapping, this is fairly arbitrary but must be non-zero. - static constexpr GPUVAddr address_space_base{0x100000}; - /// End of address space, based on address space in bits. 
- static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; + Core::System& system; - Common::BackingPageTable page_table{page_bits}; - VMAMap vma_map; - VideoCore::RasterizerInterface& rasterizer; + VideoCore::RasterizerInterface* rasterizer = nullptr; - Core::System& system; + std::vector<PageEntry> page_table; }; } // namespace Tegra diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 6d522c318..9da9fb4ff 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -41,144 +41,168 @@ static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth } static constexpr ConversionArray morton_to_linear_fns = { - MortonCopy<true, PixelFormat::ABGR8U>, - MortonCopy<true, PixelFormat::ABGR8S>, - MortonCopy<true, PixelFormat::ABGR8UI>, - MortonCopy<true, PixelFormat::B5G6R5U>, - MortonCopy<true, PixelFormat::A2B10G10R10U>, - MortonCopy<true, PixelFormat::A1B5G5R5U>, - MortonCopy<true, PixelFormat::R8U>, - MortonCopy<true, PixelFormat::R8UI>, - MortonCopy<true, PixelFormat::RGBA16F>, - MortonCopy<true, PixelFormat::RGBA16U>, - MortonCopy<true, PixelFormat::RGBA16S>, - MortonCopy<true, PixelFormat::RGBA16UI>, - MortonCopy<true, PixelFormat::R11FG11FB10F>, - MortonCopy<true, PixelFormat::RGBA32UI>, - MortonCopy<true, PixelFormat::DXT1>, - MortonCopy<true, PixelFormat::DXT23>, - MortonCopy<true, PixelFormat::DXT45>, - MortonCopy<true, PixelFormat::DXN1>, - MortonCopy<true, PixelFormat::DXN2UNORM>, - MortonCopy<true, PixelFormat::DXN2SNORM>, - MortonCopy<true, PixelFormat::BC7U>, - MortonCopy<true, PixelFormat::BC6H_UF16>, - MortonCopy<true, PixelFormat::BC6H_SF16>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4>, - MortonCopy<true, PixelFormat::BGRA8>, - MortonCopy<true, PixelFormat::RGBA32F>, - MortonCopy<true, PixelFormat::RG32F>, - MortonCopy<true, PixelFormat::R32F>, - MortonCopy<true, PixelFormat::R16F>, - MortonCopy<true, PixelFormat::R16U>, - MortonCopy<true, PixelFormat::R16S>, - MortonCopy<true, PixelFormat::R16UI>, - MortonCopy<true, PixelFormat::R16I>, - MortonCopy<true, PixelFormat::RG16>, - MortonCopy<true, PixelFormat::RG16F>, - MortonCopy<true, PixelFormat::RG16UI>, - MortonCopy<true, PixelFormat::RG16I>, - MortonCopy<true, PixelFormat::RG16S>, - MortonCopy<true, PixelFormat::RGB32F>, - MortonCopy<true, PixelFormat::RGBA8_SRGB>, - MortonCopy<true, PixelFormat::RG8U>, - MortonCopy<true, PixelFormat::RG8S>, - MortonCopy<true, PixelFormat::RG32UI>, - MortonCopy<true, PixelFormat::RGBX16F>, - MortonCopy<true, PixelFormat::R32UI>, - MortonCopy<true, PixelFormat::R32I>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4>, - MortonCopy<true, PixelFormat::BGRA8_SRGB>, - MortonCopy<true, PixelFormat::DXT1_SRGB>, - MortonCopy<true, PixelFormat::DXT23_SRGB>, - MortonCopy<true, PixelFormat::DXT45_SRGB>, - MortonCopy<true, PixelFormat::BC7U_SRGB>, - MortonCopy<true, PixelFormat::R4G4B4A4U>, + MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>, + MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>, + MortonCopy<true, PixelFormat::A8B8G8R8_SINT>, + MortonCopy<true, PixelFormat::A8B8G8R8_UINT>, + MortonCopy<true, PixelFormat::R5G6B5_UNORM>, + MortonCopy<true, PixelFormat::B5G6R5_UNORM>, + MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>, + MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>, + MortonCopy<true, PixelFormat::A2B10G10R10_UINT>, + MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>, + MortonCopy<true, PixelFormat::R8_UNORM>, + MortonCopy<true, 
PixelFormat::R8_SNORM>, + MortonCopy<true, PixelFormat::R8_SINT>, + MortonCopy<true, PixelFormat::R8_UINT>, + MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>, + MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>, + MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>, + MortonCopy<true, PixelFormat::R16G16B16A16_SINT>, + MortonCopy<true, PixelFormat::R16G16B16A16_UINT>, + MortonCopy<true, PixelFormat::B10G11R11_FLOAT>, + MortonCopy<true, PixelFormat::R32G32B32A32_UINT>, + MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>, + MortonCopy<true, PixelFormat::BC2_UNORM>, + MortonCopy<true, PixelFormat::BC3_UNORM>, + MortonCopy<true, PixelFormat::BC4_UNORM>, + MortonCopy<true, PixelFormat::BC4_SNORM>, + MortonCopy<true, PixelFormat::BC5_UNORM>, + MortonCopy<true, PixelFormat::BC5_SNORM>, + MortonCopy<true, PixelFormat::BC7_UNORM>, + MortonCopy<true, PixelFormat::BC6H_UFLOAT>, + MortonCopy<true, PixelFormat::BC6H_SFLOAT>, + MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>, + MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>, + MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>, + MortonCopy<true, PixelFormat::R32G32B32A32_SINT>, + MortonCopy<true, PixelFormat::R32G32_FLOAT>, + MortonCopy<true, PixelFormat::R32G32_SINT>, + MortonCopy<true, PixelFormat::R32_FLOAT>, + MortonCopy<true, PixelFormat::R16_FLOAT>, + MortonCopy<true, PixelFormat::R16_UNORM>, + MortonCopy<true, PixelFormat::R16_SNORM>, + MortonCopy<true, PixelFormat::R16_UINT>, + MortonCopy<true, PixelFormat::R16_SINT>, + MortonCopy<true, PixelFormat::R16G16_UNORM>, + MortonCopy<true, PixelFormat::R16G16_FLOAT>, + MortonCopy<true, PixelFormat::R16G16_UINT>, + MortonCopy<true, PixelFormat::R16G16_SINT>, + MortonCopy<true, PixelFormat::R16G16_SNORM>, + MortonCopy<true, PixelFormat::R32G32B32_FLOAT>, + MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>, + MortonCopy<true, PixelFormat::R8G8_UNORM>, + MortonCopy<true, PixelFormat::R8G8_SNORM>, + MortonCopy<true, PixelFormat::R8G8_SINT>, + MortonCopy<true, PixelFormat::R8G8_UINT>, + MortonCopy<true, PixelFormat::R32G32_UINT>, + MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>, + MortonCopy<true, PixelFormat::R32_UINT>, + MortonCopy<true, PixelFormat::R32_SINT>, + MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>, + MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>, + MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>, + MortonCopy<true, PixelFormat::B8G8R8A8_SRGB>, + MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>, + MortonCopy<true, PixelFormat::BC2_SRGB>, + MortonCopy<true, PixelFormat::BC3_SRGB>, + MortonCopy<true, PixelFormat::BC7_SRGB>, + MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_5X5>, + MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X8>, + MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X6>, + MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X10>, + MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_12X12>, + MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>, - 
MortonCopy<true, PixelFormat::ASTC_2D_8X6>, + MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X5>, + MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>, - MortonCopy<true, PixelFormat::E5B9G9R9F>, - MortonCopy<true, PixelFormat::Z32F>, - MortonCopy<true, PixelFormat::Z16>, - MortonCopy<true, PixelFormat::Z24S8>, - MortonCopy<true, PixelFormat::S8Z24>, - MortonCopy<true, PixelFormat::Z32FS8>, + MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>, + MortonCopy<true, PixelFormat::D32_FLOAT>, + MortonCopy<true, PixelFormat::D16_UNORM>, + MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>, + MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>, + MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>, }; static constexpr ConversionArray linear_to_morton_fns = { - MortonCopy<false, PixelFormat::ABGR8U>, - MortonCopy<false, PixelFormat::ABGR8S>, - MortonCopy<false, PixelFormat::ABGR8UI>, - MortonCopy<false, PixelFormat::B5G6R5U>, - MortonCopy<false, PixelFormat::A2B10G10R10U>, - MortonCopy<false, PixelFormat::A1B5G5R5U>, - MortonCopy<false, PixelFormat::R8U>, - MortonCopy<false, PixelFormat::R8UI>, - MortonCopy<false, PixelFormat::RGBA16F>, - MortonCopy<false, PixelFormat::RGBA16S>, - MortonCopy<false, PixelFormat::RGBA16U>, - MortonCopy<false, PixelFormat::RGBA16UI>, - MortonCopy<false, PixelFormat::R11FG11FB10F>, - MortonCopy<false, PixelFormat::RGBA32UI>, - MortonCopy<false, PixelFormat::DXT1>, - MortonCopy<false, PixelFormat::DXT23>, - MortonCopy<false, PixelFormat::DXT45>, - MortonCopy<false, PixelFormat::DXN1>, - MortonCopy<false, PixelFormat::DXN2UNORM>, - MortonCopy<false, PixelFormat::DXN2SNORM>, - MortonCopy<false, PixelFormat::BC7U>, - MortonCopy<false, PixelFormat::BC6H_UF16>, - MortonCopy<false, PixelFormat::BC6H_SF16>, + MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>, + MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>, + MortonCopy<false, PixelFormat::A8B8G8R8_SINT>, + MortonCopy<false, PixelFormat::A8B8G8R8_UINT>, + MortonCopy<false, PixelFormat::R5G6B5_UNORM>, + MortonCopy<false, PixelFormat::B5G6R5_UNORM>, + MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>, + MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>, + MortonCopy<false, PixelFormat::A2B10G10R10_UINT>, + MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>, + MortonCopy<false, PixelFormat::R8_UNORM>, + MortonCopy<false, PixelFormat::R8_SNORM>, + MortonCopy<false, PixelFormat::R8_SINT>, + MortonCopy<false, PixelFormat::R8_UINT>, + MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>, + MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>, + MortonCopy<false, PixelFormat::R16G16B16A16_SINT>, + MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>, + MortonCopy<false, PixelFormat::R16G16B16A16_UINT>, + MortonCopy<false, PixelFormat::B10G11R11_FLOAT>, + MortonCopy<false, PixelFormat::R32G32B32A32_UINT>, + MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>, + MortonCopy<false, PixelFormat::BC2_UNORM>, + MortonCopy<false, PixelFormat::BC3_UNORM>, + MortonCopy<false, PixelFormat::BC4_UNORM>, + MortonCopy<false, PixelFormat::BC4_SNORM>, + MortonCopy<false, PixelFormat::BC5_UNORM>, + MortonCopy<false, PixelFormat::BC5_SNORM>, + MortonCopy<false, PixelFormat::BC7_UNORM>, + MortonCopy<false, PixelFormat::BC6H_UFLOAT>, + MortonCopy<false, PixelFormat::BC6H_SFLOAT>, // TODO(Subv): Swizzling ASTC formats are not supported nullptr, - MortonCopy<false, PixelFormat::BGRA8>, - MortonCopy<false, PixelFormat::RGBA32F>, - MortonCopy<false, 
PixelFormat::RG32F>, - MortonCopy<false, PixelFormat::R32F>, - MortonCopy<false, PixelFormat::R16F>, - MortonCopy<false, PixelFormat::R16U>, - MortonCopy<false, PixelFormat::R16S>, - MortonCopy<false, PixelFormat::R16UI>, - MortonCopy<false, PixelFormat::R16I>, - MortonCopy<false, PixelFormat::RG16>, - MortonCopy<false, PixelFormat::RG16F>, - MortonCopy<false, PixelFormat::RG16UI>, - MortonCopy<false, PixelFormat::RG16I>, - MortonCopy<false, PixelFormat::RG16S>, - MortonCopy<false, PixelFormat::RGB32F>, - MortonCopy<false, PixelFormat::RGBA8_SRGB>, - MortonCopy<false, PixelFormat::RG8U>, - MortonCopy<false, PixelFormat::RG8S>, - MortonCopy<false, PixelFormat::RG32UI>, - MortonCopy<false, PixelFormat::RGBX16F>, - MortonCopy<false, PixelFormat::R32UI>, - MortonCopy<false, PixelFormat::R32I>, + MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>, + MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>, + MortonCopy<false, PixelFormat::R32G32B32A32_SINT>, + MortonCopy<false, PixelFormat::R32G32_FLOAT>, + MortonCopy<false, PixelFormat::R32G32_SINT>, + MortonCopy<false, PixelFormat::R32_FLOAT>, + MortonCopy<false, PixelFormat::R16_FLOAT>, + MortonCopy<false, PixelFormat::R16_UNORM>, + MortonCopy<false, PixelFormat::R16_SNORM>, + MortonCopy<false, PixelFormat::R16_UINT>, + MortonCopy<false, PixelFormat::R16_SINT>, + MortonCopy<false, PixelFormat::R16G16_UNORM>, + MortonCopy<false, PixelFormat::R16G16_FLOAT>, + MortonCopy<false, PixelFormat::R16G16_UINT>, + MortonCopy<false, PixelFormat::R16G16_SINT>, + MortonCopy<false, PixelFormat::R16G16_SNORM>, + MortonCopy<false, PixelFormat::R32G32B32_FLOAT>, + MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>, + MortonCopy<false, PixelFormat::R8G8_UNORM>, + MortonCopy<false, PixelFormat::R8G8_SNORM>, + MortonCopy<false, PixelFormat::R8G8_SINT>, + MortonCopy<false, PixelFormat::R8G8_UINT>, + MortonCopy<false, PixelFormat::R32G32_UINT>, + MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>, + MortonCopy<false, PixelFormat::R32_UINT>, + MortonCopy<false, PixelFormat::R32_SINT>, nullptr, nullptr, nullptr, - MortonCopy<false, PixelFormat::BGRA8_SRGB>, - MortonCopy<false, PixelFormat::DXT1_SRGB>, - MortonCopy<false, PixelFormat::DXT23_SRGB>, - MortonCopy<false, PixelFormat::DXT45_SRGB>, - MortonCopy<false, PixelFormat::BC7U_SRGB>, - MortonCopy<false, PixelFormat::R4G4B4A4U>, + MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>, + MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>, + MortonCopy<false, PixelFormat::BC2_SRGB>, + MortonCopy<false, PixelFormat::BC3_SRGB>, + MortonCopy<false, PixelFormat::BC7_SRGB>, + MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>, nullptr, nullptr, nullptr, @@ -197,12 +221,12 @@ static constexpr ConversionArray linear_to_morton_fns = { nullptr, nullptr, nullptr, - MortonCopy<false, PixelFormat::E5B9G9R9F>, - MortonCopy<false, PixelFormat::Z32F>, - MortonCopy<false, PixelFormat::Z16>, - MortonCopy<false, PixelFormat::Z24S8>, - MortonCopy<false, PixelFormat::S8Z24>, - MortonCopy<false, PixelFormat::Z32FS8>, + MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>, + MortonCopy<false, PixelFormat::D32_FLOAT>, + MortonCopy<false, PixelFormat::D16_UNORM>, + MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>, + MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>, + MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>, }; static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) { diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 5ea2b01f2..fc54ca0ef 100644 --- a/src/video_core/query_cache.h +++ 
b/src/video_core/query_cache.h @@ -12,10 +12,12 @@ #include <mutex> #include <optional> #include <unordered_map> +#include <unordered_set> #include <vector> #include "common/assert.h" #include "core/core.h" +#include "core/settings.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -89,14 +91,15 @@ private: std::shared_ptr<HostCounter> last; }; -template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, - class QueryPool> +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase { public: - explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ - static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -116,26 +119,27 @@ public: */ void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { std::unique_lock lock{mutex}; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr_opt); - VAddr cpu_addr = *cpu_addr_opt; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); - CachedQuery* query = TryGet(cpu_addr); + CachedQuery* query = TryGet(*cpu_addr); if (!query) { - ASSERT_OR_EXECUTE(cpu_addr_opt, return;); - const auto host_ptr = memory_manager.GetPointer(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - query = Register(type, cpu_addr, host_ptr, timestamp.has_value()); + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } query->BindCounter(Stream(type).Current(), timestamp); + if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + AsyncFlushQuery(*cpu_addr); + } } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. 
void UpdateCounters() { std::unique_lock lock{mutex}; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); } @@ -170,8 +174,36 @@ public: return streams[static_cast<std::size_t>(type)]; } -protected: - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + void CommitAsyncFlushes() { + committed_flushes.push_back(uncommitted_flushes); + uncommitted_flushes.reset(); + } + + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + bool ShouldWaitAsyncFlushes() const { + if (committed_flushes.empty()) { + return false; + } + return committed_flushes.front() != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (VAddr query_address : *flush_list) { + FlushAndRemoveRegion(query_address, 4); + } + committed_flushes.pop_front(); + } private: /// Flushes a memory range to guest memory and removes it from the cache. @@ -184,8 +216,8 @@ private: return cache_begin < addr_end && addr_begin < cache_end; }; - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) { const auto& it = cached_queries.find(page); if (it == std::end(cached_queries)) { continue; @@ -206,14 +238,14 @@ private: /// Registers the passed parameters as cached and returns a pointer to the stored cached query. CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); - const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS; return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, host_ptr); } /// Tries to a get a cached query. Returns nullptr on failure. CachedQuery* TryGet(VAddr addr) { - const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(addr) >> PAGE_BITS; const auto it = cached_queries.find(page); if (it == std::end(cached_queries)) { return nullptr; @@ -224,17 +256,28 @@ private: return found != std::end(contents) ? 
&*found : nullptr; } + void AsyncFlushQuery(VAddr addr) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::unordered_set<VAddr>>(); + } + uncommitted_flushes->insert(addr); + } + static constexpr std::uintptr_t PAGE_SIZE = 4096; - static constexpr unsigned PAGE_SHIFT = 12; + static constexpr unsigned PAGE_BITS = 12; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; std::recursive_mutex mutex; std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; std::array<CounterStream, VideoCore::NumQueryTypes> streams; + + std::shared_ptr<std::unordered_set<VAddr>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::unordered_set<VAddr>>> committed_flushes; }; template <class QueryCache, class HostCounter> diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp index d01db97da..53622ca05 100644 --- a/src/video_core/rasterizer_accelerated.cpp +++ b/src/video_core/rasterizer_accelerated.cpp @@ -23,15 +23,15 @@ constexpr auto RangeFromInterval(Map& map, const Interval& interval) { } // Anonymous namespace -RasterizerAccelerated::RasterizerAccelerated(Memory::Memory& cpu_memory_) +RasterizerAccelerated::RasterizerAccelerated(Core::Memory::Memory& cpu_memory_) : cpu_memory{cpu_memory_} {} RasterizerAccelerated::~RasterizerAccelerated() = default; void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { std::lock_guard lock{pages_mutex}; - const u64 page_start{addr >> Memory::PAGE_BITS}; - const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS}; + const u64 page_start{addr >> Core::Memory::PAGE_BITS}; + const u64 page_end{(addr + size + Core::Memory::PAGE_SIZE - 1) >> Core::Memory::PAGE_BITS}; // Interval maps will erase segments if count reaches 0, so if delta is negative we have to // subtract after iterating @@ -44,8 +44,8 @@ void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int del const auto interval = pair.first & pages_interval; const int count = pair.second; - const VAddr interval_start_addr = boost::icl::first(interval) << Memory::PAGE_BITS; - const VAddr interval_end_addr = boost::icl::last_next(interval) << Memory::PAGE_BITS; + const VAddr interval_start_addr = boost::icl::first(interval) << Core::Memory::PAGE_BITS; + const VAddr interval_end_addr = boost::icl::last_next(interval) << Core::Memory::PAGE_BITS; const u64 interval_size = interval_end_addr - interval_start_addr; if (delta > 0 && count == delta) { diff --git a/src/video_core/rasterizer_accelerated.h b/src/video_core/rasterizer_accelerated.h index 315798e7c..91866d7dd 100644 --- a/src/video_core/rasterizer_accelerated.h +++ b/src/video_core/rasterizer_accelerated.h @@ -11,7 +11,7 @@ #include "common/common_types.h" #include "video_core/rasterizer_interface.h" -namespace Memory { +namespace Core::Memory { class Memory; } @@ -20,7 +20,7 @@ namespace VideoCore { /// Implements the shared part in GPU accelerated rasterizers in RasterizerInterface. 
class RasterizerAccelerated : public RasterizerInterface { public: - explicit RasterizerAccelerated(Memory::Memory& cpu_memory_); + explicit RasterizerAccelerated(Core::Memory::Memory& cpu_memory_); ~RasterizerAccelerated() override; void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override; @@ -30,7 +30,7 @@ private: CachedPageMap cached_pages; std::mutex pages_mutex; - Memory::Memory& cpu_memory; + Core::Memory::Memory& cpu_memory; }; } // namespace VideoCore diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4..000000000 --- a/src/video_core/rasterizer_cache.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 22987751e..000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <mutex> -#include <set> -#include <unordered_map> - -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range_core.hpp> - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: - explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - - virtual ~RasterizerCacheObject(); - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - /// Gets the size of the shader in guest memory, required for cache management - virtual std::size_t GetSizeInBytes() const = 0; - - /// Sets whether the cached object should be considered registered - void SetIsRegistered(bool registered) { - is_registered = registered; - } - - /// Returns true if the cached object is registered - bool IsRegistered() const { - return is_registered; - } - - /// Returns true if the cached object is dirty - bool IsDirty() const { - return is_dirty; - } - - /// Returns ticks from when this cached object was last modified - u64 GetLastModifiedTicks() const { - return last_modified_ticks; - } - - /// Marks an object as recently modified, used to specify whether it is clean or dirty - template <class T> - void MarkAsModified(bool dirty, T& cache) { - is_dirty = dirty; - last_modified_ticks = cache.GetModifiedTicks(); - } - -private: - bool is_registered{}; ///< Whether the object is currently registered with the cache - bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) - u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space -}; - -template <class T> -class RasterizerCache : NonCopyable { - friend class RasterizerCacheObject; - -public: - explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - FlushObject(object); - } - } - - /// Mark the specified 
region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - if (!object->IsRegistered()) { - // Skip duplicates - continue; - } - Unregister(object); - } - } - - /// Invalidates everything in the cache - void InvalidateAll() { - std::lock_guard lock{mutex}; - - while (interval_cache.begin() != interval_cache.end()) { - Unregister(*interval_cache.begin()->second.begin()); - } - } - -protected: - /// Tries to get an object from the cache with the specified cache address - T TryGet(VAddr addr) const { - const auto iter = map_cache.find(addr); - if (iter != map_cache.end()) - return iter->second; - return nullptr; - } - - /// Register an object into the cache - virtual void Register(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(true); - interval_cache.add({GetInterval(object), ObjectSet{object}}); - map_cache.insert({object->GetCpuAddr(), object}); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); - } - - /// Unregisters an object from the cache - virtual void Unregister(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(false); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); - const VAddr addr = object->GetCpuAddr(); - interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(addr); - } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - std::lock_guard lock{mutex}; - - return ++modified_ticks; - } - - virtual void FlushObjectInner(const T& object) = 0; - - /// Flushes the specified object, updating appropriate cache state as needed - void FlushObject(const T& object) { - std::lock_guard lock{mutex}; - - if (!object->IsDirty()) { - return; - } - FlushObjectInner(object); - object->MarkAsModified(false, *this); - } - - std::recursive_mutex mutex; - -private: - /// Returns a list of cached objects from the specified memory region, ordered by access time - std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { - if (size == 0) { - return {}; - } - - std::vector<T> objects; - const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (!cached_object) { - continue; - } - objects.push_back(cached_object); - } - } - - std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { - return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); - }); - - return objects; - } - - using ObjectSet = std::set<T>; - using ObjectCache = std::unordered_map<VAddr, T>; - using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; - using ObjectInterval = typename IntervalCache::interval_type; - - static auto GetInterval(const T& object) { - return ObjectInterval::right_open(object->GetCpuAddr(), - object->GetCpuAddr() + object->GetSizeInBytes()); - } - - ObjectCache map_cache; - IntervalCache interval_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing - VideoCore::RasterizerInterface& rasterizer; -}; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 8ae5b9c4e..27ef4c69a 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -32,7 
+32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size class RasterizerInterface { public: - virtual ~RasterizerInterface() {} + virtual ~RasterizerInterface() = default; /// Dispatches a draw invocation virtual void Draw(bool is_indexed, bool is_instanced) = 0; @@ -49,19 +49,40 @@ public: /// Records a GPU query and caches it virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Signal a GPU based semaphore as a fence + virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; + + /// Signal a GPU based syncpoint as a fence + virtual void SignalSyncPoint(u32 value) = 0; + + /// Release all pending fences. + virtual void ReleaseFences() = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory virtual void FlushRegion(VAddr addr, u64 size) = 0; + /// Check if the specified memory area requires flushing to CPU Memory. + virtual bool MustFlushRegion(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region should be invalidated virtual void InvalidateRegion(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region are out of sync with the guest + virtual void OnCPUWrite(VAddr addr, u64 size) = 0; + + /// Sync memory between guest and host. + virtual void SyncGuestHost() = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; + /// Notify the host renderer to wait for previous primitive and compute operations. + virtual void WaitForIdle() = 0; + /// Notify the rasterizer to send all written commands to the host GPU. virtual void FlushCommands() = 0; @@ -69,15 +90,16 @@ public: virtual void TickFrame() = 0; /// Attempt to use a faster method to perform a surface copy - virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Tegra::Engines::Fermi2D::Config& copy_config) { + [[nodiscard]] virtual bool AccelerateSurfaceCopy( + const Tegra::Engines::Fermi2D::Regs::Surface& src, + const Tegra::Engines::Fermi2D::Regs::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } /// Attempt to use a faster method to display the framebuffer to screen - virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, - u32 pixel_stride) { + [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { return false; } @@ -85,19 +107,16 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, - const DiskResourceLoadCallback& callback = {}) {} - - /// Initializes renderer dirty flags - virtual void SetupDirtyFlags() {} + virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
- GuestDriverProfile& AccessGuestDriverProfile() { + [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; } /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - const GuestDriverProfile& AccessGuestDriverProfile() const { + [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const { return guest_driver_profile; } diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp index 919d1f2d4..a93a1732c 100644 --- a/src/video_core/renderer_base.cpp +++ b/src/video_core/renderer_base.cpp @@ -9,7 +9,9 @@ namespace VideoCore { -RendererBase::RendererBase(Core::Frontend::EmuWindow& window) : render_window{window} { +RendererBase::RendererBase(Core::Frontend::EmuWindow& window_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_) + : render_window{window_}, context{std::move(context_)} { RefreshBaseSettings(); } @@ -18,7 +20,7 @@ RendererBase::~RendererBase() = default; void RendererBase::RefreshBaseSettings() { UpdateCurrentFramebufferLayout(); - renderer_settings.use_framelimiter = Settings::values.use_frame_limit; + renderer_settings.use_framelimiter = Settings::values.use_frame_limit.GetValue(); renderer_settings.set_background_color = true; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 1d85219b6..51dde8eb5 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -15,7 +15,8 @@ namespace Core::Frontend { class EmuWindow; -} +class GraphicsContext; +} // namespace Core::Frontend namespace VideoCore { @@ -25,18 +26,19 @@ struct RendererSettings { // Screenshot std::atomic<bool> screenshot_requested{false}; - void* screenshot_bits; + void* screenshot_bits{}; std::function<void()> screenshot_complete_callback; Layout::FramebufferLayout screenshot_framebuffer_layout; }; class RendererBase : NonCopyable { public: - explicit RendererBase(Core::Frontend::EmuWindow& window); + explicit RendererBase(Core::Frontend::EmuWindow& window, + std::unique_ptr<Core::Frontend::GraphicsContext> context); virtual ~RendererBase(); /// Initialize the renderer - virtual bool Init() = 0; + [[nodiscard]] virtual bool Init() = 0; /// Shutdown the renderer virtual void ShutDown() = 0; @@ -44,43 +46,46 @@ public: /// Finalize rendering the guest frame and draw into the presentation texture virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; - /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer - /// specific implementation) - /// Returns true if a frame was drawn - virtual bool TryPresent(int timeout_ms) = 0; - // Getter/setter functions: // ------------------------ - f32 GetCurrentFPS() const { + [[nodiscard]] f32 GetCurrentFPS() const { return m_current_fps; } - int GetCurrentFrame() const { + [[nodiscard]] int GetCurrentFrame() const { return m_current_frame; } - RasterizerInterface& Rasterizer() { + [[nodiscard]] RasterizerInterface& Rasterizer() { return *rasterizer; } - const RasterizerInterface& Rasterizer() const { + [[nodiscard]] const RasterizerInterface& Rasterizer() const { return *rasterizer; } - Core::Frontend::EmuWindow& GetRenderWindow() { + [[nodiscard]] Core::Frontend::GraphicsContext& Context() { + return *context; + } + + [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const { + return *context; + } + + [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() { return render_window; } - const Core::Frontend::EmuWindow& GetRenderWindow() 
const { + [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const { return render_window; } - RendererSettings& Settings() { + [[nodiscard]] RendererSettings& Settings() { return renderer_settings; } - const RendererSettings& Settings() const { + [[nodiscard]] const RendererSettings& Settings() const { return renderer_settings; } @@ -94,6 +99,7 @@ public: protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. std::unique_ptr<RasterizerInterface> rasterizer; + std::unique_ptr<Core::Frontend::GraphicsContext> context; f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer int m_current_frame = 0; ///< Current frame, should be set by the renderer diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp new file mode 100644 index 000000000..d6120c23e --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -0,0 +1,2126 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <string> +#include <string_view> +#include <utility> +#include <variant> + +#include <fmt/format.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/shader_ir.h" + +// Predicates in the decompiled code follow the convention that -1 means true and 0 means false. +// GLASM lacks booleans, so they have to be implemented as integers. +// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to +// select between two values, because -1 will be evaluated as true and 0 as false. 
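On the -1/0 predicate convention described in the comment above: CMP.S selects its second source operand when the first operand is negative and its third otherwise, and the decompiler relies on exactly that (later in this file, logical NOT is emitted as "CMP.S dst, pred, 0, -1"). Encoding true as -1 therefore lets one CMP.S act as both a select and a negation, while NOT.U flips the same encoding bitwise. Below is a minimal, self-contained C++ sketch that only models this selection rule for illustration; the helper names are invented and the snippet is not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of "CMP.S dst, pred, on_true, on_false":
    // the second operand is selected when the predicate is negative.
    constexpr std::int32_t CmpSelect(std::int32_t pred, std::int32_t on_true, std::int32_t on_false) {
        return pred < 0 ? on_true : on_false;
    }

    // Logical negation of the -1/0 encoding; bitwise NOT (NOT.U) yields the same result.
    constexpr std::int32_t LogicalNot(std::int32_t pred) {
        return CmpSelect(pred, 0, -1);
    }

    int main() {
        constexpr std::int32_t k_true = -1; // all bits set
        constexpr std::int32_t k_false = 0;
        static_assert(CmpSelect(k_true, 10, 20) == 10);  // true selects the first value
        static_assert(CmpSelect(k_false, 10, 20) == 20); // false selects the second
        static_assert(LogicalNot(k_true) == k_false);
        static_assert(~k_true == k_false);               // NOT.U flips the encoding too
        assert(LogicalNot(k_false) == k_true);
        return 0;
    }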
+ +namespace OpenGL { + +namespace { + +using Tegra::Engines::ShaderType; +using Tegra::Shader::Attribute; +using Tegra::Shader::PixelImap; +using Tegra::Shader::Register; +using namespace VideoCommon::Shader; +using Operation = const OperationNode&; + +constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; + +char Swizzle(std::size_t component) { + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); +} + +constexpr bool IsGenericAttribute(Attribute::Index index) { + return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; +} + +u32 GetGenericAttributeIndex(Attribute::Index index) { + ASSERT(IsGenericAttribute(index)); + return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); +} + +std::string_view Modifiers(Operation operation) { + const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta()); + if (meta && meta->precise) { + return ".PREC"; + } + return ""; +} + +std::string_view GetInputFlags(PixelImap attribute) { + switch (attribute) { + case PixelImap::Perspective: + return ""; + case PixelImap::Constant: + return "FLAT "; + case PixelImap::ScreenLinear: + return "NOPERSPECTIVE "; + case PixelImap::Unused: + break; + } + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + return {}; +} + +std::string_view ImageType(Tegra::Shader::ImageType image_type) { + switch (image_type) { + case Tegra::Shader::ImageType::Texture1D: + return "1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "BUFFER"; + case Tegra::Shader::ImageType::Texture1DArray: + return "ARRAY1D"; + case Tegra::Shader::ImageType::Texture2D: + return "2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "ARRAY2D"; + case Tegra::Shader::ImageType::Texture3D: + return "3D"; + } + UNREACHABLE(); + return {}; +} + +std::string_view StackName(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "SSY"; + case MetaStackClass::Pbk: + return "PBK"; + } + UNREACHABLE(); + return ""; +}; + +std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points: + return "POINTS"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return "LINES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + return "LINES_ADJACENCY"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + return "TRIANGLES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return "TRIANGLES_ADJACENCY"; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return "POINTS"; + } +} + +std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "POINTS"; + case Tegra::Shader::OutputTopology::LineStrip: + return "LINE_STRIP"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "TRIANGLE_STRIP"; + default: + UNIMPLEMENTED_MSG("Unknown output topology: {}", 
static_cast<u32>(topology)); + return "points"; + } +} + +std::string_view StageInputName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + case ShaderType::Geometry: + return "vertex"; + case ShaderType::Fragment: + return "fragment"; + case ShaderType::Compute: + return "invocation"; + default: + UNREACHABLE(); + return ""; + } +} + +std::string TextureType(const MetaTexture& meta) { + if (meta.sampler.is_buffer) { + return "BUFFER"; + } + std::string type; + if (meta.sampler.is_shadow) { + type += "SHADOW"; + } + if (meta.sampler.is_array) { + type += "ARRAY"; + } + type += [&meta] { + switch (meta.sampler.type) { + case Tegra::Shader::TextureType::Texture1D: + return "1D"; + case Tegra::Shader::TextureType::Texture2D: + return "2D"; + case Tegra::Shader::TextureType::Texture3D: + return "3D"; + case Tegra::Shader::TextureType::TextureCube: + return "CUBE"; + } + UNREACHABLE(); + return "2D"; + }(); + return type; +} + +class ARBDecompiler final { +public: + explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier); + + std::string Code() const { + return shader_source; + } + +private: + void DefineGlobalMemory(); + + void DeclareHeader(); + void DeclareVertex(); + void DeclareGeometry(); + void DeclareFragment(); + void DeclareCompute(); + void DeclareInputAttributes(); + void DeclareOutputAttributes(); + void DeclareLocalMemory(); + void DeclareGlobalMemory(); + void DeclareConstantBuffers(); + void DeclareRegisters(); + void DeclareTemporaries(); + void DeclarePredicates(); + void DeclareInternalFlags(); + + void InitializeVariables(); + + void DecompileAST(); + void DecompileBranchMode(); + + void VisitAST(const ASTNode& node); + std::string VisitExpression(const Expr& node); + + void VisitBlock(const NodeBlock& bb); + + std::string Visit(const Node& node); + + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); + std::string BuildAoffi(Operation); + std::string GlobalMemoryPointer(const GmemNode& gmem); + void Exit(); + + std::string Assign(Operation); + std::string Select(Operation); + std::string FClamp(Operation); + std::string FCastHalf0(Operation); + std::string FCastHalf1(Operation); + std::string FSqrt(Operation); + std::string FSwizzleAdd(Operation); + std::string HAdd2(Operation); + std::string HMul2(Operation); + std::string HFma2(Operation); + std::string HAbsolute(Operation); + std::string HNegate(Operation); + std::string HClamp(Operation); + std::string HCastFloat(Operation); + std::string HUnpack(Operation); + std::string HMergeF32(Operation); + std::string HMergeH0(Operation); + std::string HMergeH1(Operation); + std::string HPack2(Operation); + std::string LogicalAssign(Operation); + std::string LogicalPick2(Operation); + std::string LogicalAnd2(Operation); + std::string FloatOrdered(Operation); + std::string FloatUnordered(Operation); + std::string LogicalAddCarry(Operation); + std::string Texture(Operation); + std::string TextureGather(Operation); + std::string TextureQueryDimensions(Operation); + std::string TextureQueryLod(Operation); + std::string TexelFetch(Operation); + std::string TextureGradient(Operation); + std::string ImageLoad(Operation); + std::string ImageStore(Operation); + std::string Branch(Operation); + std::string BranchIndirect(Operation); + std::string PushFlowStack(Operation); + std::string PopFlowStack(Operation); + std::string Exit(Operation); + std::string Discard(Operation); + std::string EmitVertex(Operation); + 
std::string EndPrimitive(Operation); + std::string InvocationId(Operation); + std::string YNegate(Operation); + std::string ThreadId(Operation); + std::string ShuffleIndexed(Operation); + std::string Barrier(Operation); + std::string MemoryBarrierGroup(Operation); + std::string MemoryBarrierGlobal(Operation); + + template <const std::string_view& op> + std::string Unary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); + return temporary; + } + + template <const std::string_view& op> + std::string Binary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1])); + return temporary; + } + + template <const std::string_view& op> + std::string Trinary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1]), Visit(operation[2])); + return temporary; + } + + template <const std::string_view& op, bool unordered> + std::string FloatComparison(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NE.x), -1;", temporary); + + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + if constexpr (unordered) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + } else if (op == SNE_F) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + } + return temporary; + } + + template <const std::string_view& op, bool is_nan> + std::string HalfComparison(Operation operation) { + std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + AddLine("UP2H.F {}, {};", tmp1, op_a); + AddLine("UP2H.F {}, {};", tmp2, op_b); + AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); + AddLine("TRUNC.U.CC RC.xy, {};", tmp1); + AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); + AddLine("MOV.S {}.x (NE.x), -1;", tmp1); + AddLine("MOV.S {}.y (NE.y), -1;", tmp1); + if constexpr (is_nan) { + AddLine("MOVC.F RC.x, {};", op_a); + AddLine("MOV.S {}.x (NAN.x), -1;", tmp1); + AddLine("MOVC.F RC.x, {};", op_b); + AddLine("MOV.S {}.y (NAN.x), -1;", tmp1); + } + return tmp1; + } + + template <const std::string_view& op, const std::string_view& type> + std::string AtomicImage(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 
0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord, + image_id, ImageType(meta.image.type)); + return fmt::format("{}.x", coord); + } + + template <const std::string_view& op, const std::string_view& type> + std::string Atomic(Operation operation) { + std::string temporary = AllocTemporary(); + std::string address; + std::string_view opname; + bool robust = false; + if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + address = GlobalMemoryPointer(*gmem); + opname = "ATOM"; + robust = true; + } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); + opname = "ATOMS"; + } else { + UNREACHABLE(); + return "{0, 0, 0, 0}"; + } + if (robust) { + AddLine("IF NE.x;"); + } + AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } + return temporary; + } + + template <char type> + std::string Negate(Operation operation) { + std::string temporary = AllocTemporary(); + if constexpr (type == 'F') { + AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); + } else { + AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0])); + } + return temporary; + } + + template <char type> + std::string Absolute(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); + return temporary; + } + + template <char type> + std::string BitfieldInsert(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); + AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]), + Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char type> + std::string BitfieldExtract(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); + AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char swizzle> + std::string LocalInvocationId(Operation) { + return fmt::format("invocation.localid.{}", swizzle); + } + + template <char swizzle> + std::string WorkGroupId(Operation) { + return fmt::format("invocation.groupid.{}", swizzle); + } + + template <char c1, char c2> + std::string ThreadMask(Operation) { + return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2); + } + + template <typename... Args> + void AddExpression(std::string_view text, Args&&... args) { + shader_source += fmt::format(text, std::forward<Args>(args)...); + } + + template <typename... Args> + void AddLine(std::string_view text, Args&&... 
args) { + AddExpression(text, std::forward<Args>(args)...); + shader_source += '\n'; + } + + std::string AllocLongVectorTemporary() { + max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1); + return fmt::format("L{}", num_long_temporaries++); + } + + std::string AllocLongTemporary() { + return fmt::format("{}.x", AllocLongVectorTemporary()); + } + + std::string AllocVectorTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}", num_temporaries++); + } + + std::string AllocTemporary() { + return fmt::format("{}.x", AllocVectorTemporary()); + } + + void ResetTemporaries() noexcept { + num_temporaries = 0; + num_long_temporaries = 0; + } + + const Device& device; + const ShaderIR& ir; + const Registry& registry; + const ShaderType stage; + + std::size_t num_temporaries = 0; + std::size_t max_temporaries = 0; + + std::size_t num_long_temporaries = 0; + std::size_t max_long_temporaries = 0; + + std::map<GlobalMemoryBase, u32> global_memory_names; + + std::string shader_source; + + static constexpr std::string_view ADD_F32 = "ADD.F32"; + static constexpr std::string_view ADD_S = "ADD.S"; + static constexpr std::string_view ADD_U = "ADD.U"; + static constexpr std::string_view MUL_F32 = "MUL.F32"; + static constexpr std::string_view MUL_S = "MUL.S"; + static constexpr std::string_view MUL_U = "MUL.U"; + static constexpr std::string_view DIV_F32 = "DIV.F32"; + static constexpr std::string_view DIV_S = "DIV.S"; + static constexpr std::string_view DIV_U = "DIV.U"; + static constexpr std::string_view MAD_F32 = "MAD.F32"; + static constexpr std::string_view RSQ_F32 = "RSQ.F32"; + static constexpr std::string_view COS_F32 = "COS.F32"; + static constexpr std::string_view SIN_F32 = "SIN.F32"; + static constexpr std::string_view EX2_F32 = "EX2.F32"; + static constexpr std::string_view LG2_F32 = "LG2.F32"; + static constexpr std::string_view SLT_F = "SLT.F32"; + static constexpr std::string_view SLT_S = "SLT.S"; + static constexpr std::string_view SLT_U = "SLT.U"; + static constexpr std::string_view SEQ_F = "SEQ.F32"; + static constexpr std::string_view SEQ_S = "SEQ.S"; + static constexpr std::string_view SEQ_U = "SEQ.U"; + static constexpr std::string_view SLE_F = "SLE.F32"; + static constexpr std::string_view SLE_S = "SLE.S"; + static constexpr std::string_view SLE_U = "SLE.U"; + static constexpr std::string_view SGT_F = "SGT.F32"; + static constexpr std::string_view SGT_S = "SGT.S"; + static constexpr std::string_view SGT_U = "SGT.U"; + static constexpr std::string_view SNE_F = "SNE.F32"; + static constexpr std::string_view SNE_S = "SNE.S"; + static constexpr std::string_view SNE_U = "SNE.U"; + static constexpr std::string_view SGE_F = "SGE.F32"; + static constexpr std::string_view SGE_S = "SGE.S"; + static constexpr std::string_view SGE_U = "SGE.U"; + static constexpr std::string_view AND_S = "AND.S"; + static constexpr std::string_view AND_U = "AND.U"; + static constexpr std::string_view TRUNC_F = "TRUNC.F"; + static constexpr std::string_view TRUNC_S = "TRUNC.S"; + static constexpr std::string_view TRUNC_U = "TRUNC.U"; + static constexpr std::string_view SHL_S = "SHL.S"; + static constexpr std::string_view SHL_U = "SHL.U"; + static constexpr std::string_view SHR_S = "SHR.S"; + static constexpr std::string_view SHR_U = "SHR.U"; + static constexpr std::string_view OR_S = "OR.S"; + static constexpr std::string_view OR_U = "OR.U"; + static constexpr std::string_view XOR_S = "XOR.S"; + static constexpr std::string_view XOR_U = 
"XOR.U"; + static constexpr std::string_view NOT_S = "NOT.S"; + static constexpr std::string_view NOT_U = "NOT.U"; + static constexpr std::string_view BTC_S = "BTC.S"; + static constexpr std::string_view BTC_U = "BTC.U"; + static constexpr std::string_view BTFM_S = "BTFM.S"; + static constexpr std::string_view BTFM_U = "BTFM.U"; + static constexpr std::string_view ROUND_F = "ROUND.F"; + static constexpr std::string_view CEIL_F = "CEIL.F"; + static constexpr std::string_view FLR_F = "FLR.F"; + static constexpr std::string_view I2F_S = "I2F.S"; + static constexpr std::string_view I2F_U = "I2F.U"; + static constexpr std::string_view MIN_F = "MIN.F"; + static constexpr std::string_view MIN_S = "MIN.S"; + static constexpr std::string_view MIN_U = "MIN.U"; + static constexpr std::string_view MAX_F = "MAX.F"; + static constexpr std::string_view MAX_S = "MAX.S"; + static constexpr std::string_view MAX_U = "MAX.U"; + static constexpr std::string_view MOV_U = "MOV.U"; + static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U"; + static constexpr std::string_view TGALL_U = "TGALL.U"; + static constexpr std::string_view TGANY_U = "TGANY.U"; + static constexpr std::string_view TGEQ_U = "TGEQ.U"; + static constexpr std::string_view EXCH = "EXCH"; + static constexpr std::string_view ADD = "ADD"; + static constexpr std::string_view MIN = "MIN"; + static constexpr std::string_view MAX = "MAX"; + static constexpr std::string_view AND = "AND"; + static constexpr std::string_view OR = "OR"; + static constexpr std::string_view XOR = "XOR"; + static constexpr std::string_view U32 = "U32"; + static constexpr std::string_view S32 = "S32"; + + static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); + using DecompilerType = std::string (ARBDecompiler::*)(Operation); + static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { + &ARBDecompiler::Assign, + + &ARBDecompiler::Select, + + &ARBDecompiler::Binary<ADD_F32>, + &ARBDecompiler::Binary<MUL_F32>, + &ARBDecompiler::Binary<DIV_F32>, + &ARBDecompiler::Trinary<MAD_F32>, + &ARBDecompiler::Negate<'F'>, + &ARBDecompiler::Absolute<'F'>, + &ARBDecompiler::FClamp, + &ARBDecompiler::FCastHalf0, + &ARBDecompiler::FCastHalf1, + &ARBDecompiler::Binary<MIN_F>, + &ARBDecompiler::Binary<MAX_F>, + &ARBDecompiler::Unary<COS_F32>, + &ARBDecompiler::Unary<SIN_F32>, + &ARBDecompiler::Unary<EX2_F32>, + &ARBDecompiler::Unary<LG2_F32>, + &ARBDecompiler::Unary<RSQ_F32>, + &ARBDecompiler::FSqrt, + &ARBDecompiler::Unary<ROUND_F>, + &ARBDecompiler::Unary<FLR_F>, + &ARBDecompiler::Unary<CEIL_F>, + &ARBDecompiler::Unary<TRUNC_F>, + &ARBDecompiler::Unary<I2F_S>, + &ARBDecompiler::Unary<I2F_U>, + &ARBDecompiler::FSwizzleAdd, + + &ARBDecompiler::Binary<ADD_S>, + &ARBDecompiler::Binary<MUL_S>, + &ARBDecompiler::Binary<DIV_S>, + &ARBDecompiler::Negate<'S'>, + &ARBDecompiler::Absolute<'S'>, + &ARBDecompiler::Binary<MIN_S>, + &ARBDecompiler::Binary<MAX_S>, + + &ARBDecompiler::Unary<TRUNC_S>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_S>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_S>, + &ARBDecompiler::Binary<AND_S>, + &ARBDecompiler::Binary<OR_S>, + &ARBDecompiler::Binary<XOR_S>, + &ARBDecompiler::Unary<NOT_S>, + &ARBDecompiler::BitfieldInsert<'S'>, + &ARBDecompiler::BitfieldExtract<'S'>, + &ARBDecompiler::Unary<BTC_S>, + &ARBDecompiler::Unary<BTFM_S>, + + &ARBDecompiler::Binary<ADD_U>, + &ARBDecompiler::Binary<MUL_U>, + &ARBDecompiler::Binary<DIV_U>, + &ARBDecompiler::Binary<MIN_U>, + 
&ARBDecompiler::Binary<MAX_U>, + &ARBDecompiler::Unary<TRUNC_U>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::BitfieldInsert<'U'>, + &ARBDecompiler::BitfieldExtract<'U'>, + &ARBDecompiler::Unary<BTC_U>, + &ARBDecompiler::Unary<BTFM_U>, + + &ARBDecompiler::HAdd2, + &ARBDecompiler::HMul2, + &ARBDecompiler::HFma2, + &ARBDecompiler::HAbsolute, + &ARBDecompiler::HNegate, + &ARBDecompiler::HClamp, + &ARBDecompiler::HCastFloat, + &ARBDecompiler::HUnpack, + &ARBDecompiler::HMergeF32, + &ARBDecompiler::HMergeH0, + &ARBDecompiler::HMergeH1, + &ARBDecompiler::HPack2, + + &ARBDecompiler::LogicalAssign, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::LogicalPick2, + &ARBDecompiler::LogicalAnd2, + + &ARBDecompiler::FloatComparison<SLT_F, false>, + &ARBDecompiler::FloatComparison<SEQ_F, false>, + &ARBDecompiler::FloatComparison<SLE_F, false>, + &ARBDecompiler::FloatComparison<SGT_F, false>, + &ARBDecompiler::FloatComparison<SNE_F, false>, + &ARBDecompiler::FloatComparison<SGE_F, false>, + &ARBDecompiler::FloatOrdered, + &ARBDecompiler::FloatUnordered, + &ARBDecompiler::FloatComparison<SLT_F, true>, + &ARBDecompiler::FloatComparison<SEQ_F, true>, + &ARBDecompiler::FloatComparison<SLE_F, true>, + &ARBDecompiler::FloatComparison<SGT_F, true>, + &ARBDecompiler::FloatComparison<SNE_F, true>, + &ARBDecompiler::FloatComparison<SGE_F, true>, + + &ARBDecompiler::Binary<SLT_S>, + &ARBDecompiler::Binary<SEQ_S>, + &ARBDecompiler::Binary<SLE_S>, + &ARBDecompiler::Binary<SGT_S>, + &ARBDecompiler::Binary<SNE_S>, + &ARBDecompiler::Binary<SGE_S>, + + &ARBDecompiler::Binary<SLT_U>, + &ARBDecompiler::Binary<SEQ_U>, + &ARBDecompiler::Binary<SLE_U>, + &ARBDecompiler::Binary<SGT_U>, + &ARBDecompiler::Binary<SNE_U>, + &ARBDecompiler::Binary<SGE_U>, + + &ARBDecompiler::LogicalAddCarry, + + &ARBDecompiler::HalfComparison<SLT_F, false>, + &ARBDecompiler::HalfComparison<SEQ_F, false>, + &ARBDecompiler::HalfComparison<SLE_F, false>, + &ARBDecompiler::HalfComparison<SGT_F, false>, + &ARBDecompiler::HalfComparison<SNE_F, false>, + &ARBDecompiler::HalfComparison<SGE_F, false>, + &ARBDecompiler::HalfComparison<SLT_F, true>, + &ARBDecompiler::HalfComparison<SEQ_F, true>, + &ARBDecompiler::HalfComparison<SLE_F, true>, + &ARBDecompiler::HalfComparison<SGT_F, true>, + &ARBDecompiler::HalfComparison<SNE_F, true>, + &ARBDecompiler::HalfComparison<SGE_F, true>, + + &ARBDecompiler::Texture, + &ARBDecompiler::Texture, + &ARBDecompiler::TextureGather, + &ARBDecompiler::TextureQueryDimensions, + &ARBDecompiler::TextureQueryLod, + &ARBDecompiler::TexelFetch, + &ARBDecompiler::TextureGradient, + + &ARBDecompiler::ImageLoad, + &ARBDecompiler::ImageStore, + + &ARBDecompiler::AtomicImage<ADD, U32>, + &ARBDecompiler::AtomicImage<AND, U32>, + &ARBDecompiler::AtomicImage<OR, U32>, + &ARBDecompiler::AtomicImage<XOR, U32>, + &ARBDecompiler::AtomicImage<EXCH, U32>, + + &ARBDecompiler::Atomic<EXCH, U32>, + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<EXCH, S32>, + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + 
&ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Branch, + &ARBDecompiler::BranchIndirect, + &ARBDecompiler::PushFlowStack, + &ARBDecompiler::PopFlowStack, + &ARBDecompiler::Exit, + &ARBDecompiler::Discard, + + &ARBDecompiler::EmitVertex, + &ARBDecompiler::EndPrimitive, + + &ARBDecompiler::InvocationId, + &ARBDecompiler::YNegate, + &ARBDecompiler::LocalInvocationId<'x'>, + &ARBDecompiler::LocalInvocationId<'y'>, + &ARBDecompiler::LocalInvocationId<'z'>, + &ARBDecompiler::WorkGroupId<'x'>, + &ARBDecompiler::WorkGroupId<'y'>, + &ARBDecompiler::WorkGroupId<'z'>, + + &ARBDecompiler::Unary<TGBALLOT_U>, + &ARBDecompiler::Unary<TGALL_U>, + &ARBDecompiler::Unary<TGANY_U>, + &ARBDecompiler::Unary<TGEQ_U>, + + &ARBDecompiler::ThreadId, + &ARBDecompiler::ThreadMask<'e', 'q'>, + &ARBDecompiler::ThreadMask<'g', 'e'>, + &ARBDecompiler::ThreadMask<'g', 't'>, + &ARBDecompiler::ThreadMask<'l', 'e'>, + &ARBDecompiler::ThreadMask<'l', 't'>, + &ARBDecompiler::ShuffleIndexed, + + &ARBDecompiler::Barrier, + &ARBDecompiler::MemoryBarrierGroup, + &ARBDecompiler::MemoryBarrierGlobal, + }; +}; + +ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier) + : device{device}, ir{ir}, registry{registry}, stage{stage} { + DefineGlobalMemory(); + + AddLine("TEMP RC;"); + AddLine("TEMP FSWZA[4];"); + AddLine("TEMP FSWZB[4];"); + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + AddLine("END"); + + const std::string code = std::move(shader_source); + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareLocalMemory(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareRegisters(); + DeclareTemporaries(); + DeclarePredicates(); + DeclareInternalFlags(); + + shader_source += code; +} + +std::string_view HeaderStageName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + return "vp"; + case ShaderType::Geometry: + return "gp"; + case ShaderType::Fragment: + return "fp"; + case ShaderType::Compute: + return "cp"; + default: + UNREACHABLE(); + return ""; + } +} + +void ARBDecompiler::DefineGlobalMemory() { + u32 binding = 0; + for (const auto& pair : ir.GetGlobalMemory()) { + const GlobalMemoryBase base = pair.first; + global_memory_names.emplace(base, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareHeader() { + AddLine("!!NV{}5.0", HeaderStageName(stage)); + // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D + AddLine("OPTION NV_internal;"); + AddLine("OPTION NV_gpu_program_fp64;"); + AddLine("OPTION NV_shader_thread_group;"); + if (ir.UsesWarps() && device.HasWarpIntrinsics()) { + AddLine("OPTION NV_shader_thread_shuffle;"); + } + if (stage == ShaderType::Vertex) { + if (device.HasNvViewportArray2()) { + AddLine("OPTION NV_viewport_array2;"); + } + } + if (stage == ShaderType::Fragment) { + 
AddLine("OPTION ARB_draw_buffers;"); + } + if (device.HasImageLoadFormatted()) { + AddLine("OPTION EXT_shader_image_load_formatted;"); + } +} + +void ARBDecompiler::DeclareVertex() { + if (stage != ShaderType::Vertex) { + return; + } + AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};"); +} + +void ARBDecompiler::DeclareGeometry() { + if (stage != ShaderType::Geometry) { + return; + } + const auto& info = registry.GetGraphicsInfo(); + const auto& header = ir.GetHeader(); + AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology)); + AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology)); + AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value()); + AddLine("ATTRIB vertex_position = vertex.position;"); +} + +void ARBDecompiler::DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + AddLine("OUTPUT result_color7 = result.color[7];"); + AddLine("OUTPUT result_color6 = result.color[6];"); + AddLine("OUTPUT result_color5 = result.color[5];"); + AddLine("OUTPUT result_color4 = result.color[4];"); + AddLine("OUTPUT result_color3 = result.color[3];"); + AddLine("OUTPUT result_color2 = result.color[2];"); + AddLine("OUTPUT result_color1 = result.color[1];"); + AddLine("OUTPUT result_color0 = result.color;"); +} + +void ARBDecompiler::DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const ComputeInfo& info = registry.GetComputeInfo(); + AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1], + info.workgroup_size[2]); + if (info.shared_memory_size_in_words == 0) { + return; + } + const u32 limit = device.GetMaxComputeSharedMemorySize(); + u32 size_in_bytes = info.shared_memory_size_in_words * 4; + if (size_in_bytes > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size_in_bytes, limit); + size_in_bytes = limit; + } + + AddLine("SHARED_MEMORY {};", size_in_bytes); + AddLine("SHARED shared_mem[] = {{program.sharedmem}};"); +} + +void ARBDecompiler::DeclareInputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + const std::string_view stage_name = StageInputName(stage); + for (const auto attribute : ir.GetInputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + + std::string_view suffix; + if (stage == ShaderType::Fragment) { + const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)}; + if (input_mode == PixelImap::Unused) { + return; + } + suffix = GetInputFlags(input_mode); + } + AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index, + index); + } +} + +void ARBDecompiler::DeclareOutputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index); + } +} + +void ARBDecompiler::DeclareLocalMemory() { + u64 size = 0; + if (stage == ShaderType::Compute) { + size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + size = ir.GetHeader().GetLocalMemorySize(); + } + if (size == 0) { + return; + } + const u64 element_count = Common::AlignUp(size, 4) / 4; + AddLine("TEMP lmem[{}];", element_count); +} + +void ARBDecompiler::DeclareGlobalMemory() { + const size_t num_entries = ir.GetGlobalMemory().size(); + if (num_entries > 
0) { + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); + } +} + +void ARBDecompiler::DeclareConstantBuffers() { + u32 binding = 0; + for (const auto& cbuf : ir.GetConstantBuffers()) { + AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareRegisters() { + for (const u32 gpr : ir.GetRegisters()) { + AddLine("TEMP R{};", gpr); + } +} + +void ARBDecompiler::DeclareTemporaries() { + for (std::size_t i = 0; i < max_temporaries; ++i) { + AddLine("TEMP T{};", i); + } + for (std::size_t i = 0; i < max_long_temporaries; ++i) { + AddLine("LONG TEMP L{};", i); + } +} + +void ARBDecompiler::DeclarePredicates() { + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("TEMP P{};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DeclareInternalFlags() { + for (const char* name : INTERNAL_FLAG_NAMES) { + AddLine("TEMP {};", name); + } +} + +void ARBDecompiler::InitializeVariables() { + AddLine("MOV.F32 FSWZA[0], -1;"); + AddLine("MOV.F32 FSWZA[1], 1;"); + AddLine("MOV.F32 FSWZA[2], -1;"); + AddLine("MOV.F32 FSWZA[3], 0;"); + AddLine("MOV.F32 FSWZB[0], -1;"); + AddLine("MOV.F32 FSWZB[1], -1;"); + AddLine("MOV.F32 FSWZB[2], 1;"); + AddLine("MOV.F32 FSWZB[3], -1;"); + + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { + AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index); + } + for (const u32 gpr : ir.GetRegisters()) { + AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr); + } + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DecompileAST() { + const u32 num_flow_variables = ir.GetASTNumVariables(); + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("TEMP F{};", i); + } + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i); + } + + InitializeVariables(); + + VisitAST(ir.GetASTProgram()); +} + +void ARBDecompiler::DecompileBranchMode() { + static constexpr u32 FLOW_STACK_SIZE = 20; + if (!ir.IsFlowStackDisabled()) { + AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE); + AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE); + AddLine("TEMP SSY_TOP;"); + AddLine("TEMP PBK_TOP;"); + } + + AddLine("TEMP PC;"); + + if (!ir.IsFlowStackDisabled()) { + AddLine("MOV.U SSY_TOP.x, 0;"); + AddLine("MOV.U PBK_TOP.x, 0;"); + } + + InitializeVariables(); + + const auto basic_block_end = ir.GetBasicBlocks().end(); + auto basic_block_it = ir.GetBasicBlocks().begin(); + const u32 first_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", first_address); + + AddLine("REP;"); + + std::size_t num_blocks = 0; + while (basic_block_it != basic_block_end) { + const auto& [address, bb] = *basic_block_it; + ++num_blocks; + + AddLine("SEQ.S.CC RC.x, PC.x, {};", address); + AddLine("IF NE.x;"); + + VisitBlock(bb); + + ++basic_block_it; + + if (basic_block_it != basic_block_end) { + const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]); + if (!op || op->GetCode() != OperationCode::Branch) { + const u32 next_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", next_address); + AddLine("CONT;"); + } + } + + AddLine("ELSE;"); + } + AddLine("RET;"); + while (num_blocks--) { + AddLine("ENDIF;"); + } + + 
AddLine("ENDREP;"); +} + +void ARBDecompiler::VisitAST(const ASTNode& node) { + if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) { + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + + AddLine("MOVC.U RC.x, {};", condition); + AddLine("IF NE.x;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("ENDIF;"); + } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + AddLine("ELSE;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(ast->nodes); + } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + ResetTemporaries(); + } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + AddLine("REP;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("MOVC.U RC.x, {};", condition); + AddLine("BRK (NE.x);"); + AddLine("ENDREP;"); + } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast->condition); + if (!is_true) { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("IF NE.x;"); + ResetTemporaries(); + } + if (ast->kills) { + AddLine("KIL TR;"); + } else { + Exit(); + } + if (!is_true) { + AddLine("ENDIF;"); + } + } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast->condition)) { + AddLine("BRK;"); + } else { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("BRK (NE.x);"); + ResetTemporaries(); + } + } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) { + // Nothing to do + } else { + UNREACHABLE(); + } +} + +std::string ARBDecompiler::VisitExpression(const Expr& node) { + if (const auto expr = std::get_if<ExprAnd>(&*node)) { + std::string result = AllocTemporary(); + AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprOr>(&*node)) { + std::string result = AllocTemporary(); + AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprNot>(&*node)) { + std::string result = AllocTemporary(); + AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); + return result; + } + if (const auto expr = std::get_if<ExprPredicate>(&*node)) { + return fmt::format("P{}.x", static_cast<u64>(expr->predicate)); + } + if (const auto expr = std::get_if<ExprCondCode>(&*node)) { + return Visit(ir.GetConditionCode(expr->cc)); + } + if (const auto expr = std::get_if<ExprVar>(&*node)) { + return fmt::format("F{}.x", expr->var_index); + } + if (const auto expr = std::get_if<ExprBoolean>(&*node)) { + return expr->value ? 
"0xffffffff" : "0"; + } + if (const auto expr = std::get_if<ExprGprEqual>(&*node)) { + std::string result = AllocTemporary(); + AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); + return result; + } + UNREACHABLE(); + return "0"; +} + +void ARBDecompiler::VisitBlock(const NodeBlock& bb) { + for (const auto& node : bb) { + Visit(node); + } +} + +std::string ARBDecompiler::Visit(const Node& node) { + if (const auto operation = std::get_if<OperationNode>(&*node)) { + if (const auto amend_index = operation->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + const std::size_t index = static_cast<std::size_t>(operation->GetCode()); + if (index >= OPERATION_DECOMPILERS.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", index); + return {}; + } + const auto decompiler = OPERATION_DECOMPILERS[index]; + if (decompiler == nullptr) { + UNREACHABLE_MSG("Undefined operation: {}", index); + return {}; + } + return (this->*decompiler)(*operation); + } + + if (const auto gpr = std::get_if<GprNode>(&*node)) { + const u32 index = gpr->GetIndex(); + if (index == Register::ZeroIndex) { + return "{0, 0, 0, 0}.x"; + } + return fmt::format("R{}.x", index); + } + + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + return fmt::format("CV{}.x", cv->GetIndex()); + } + + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); + return temporary; + } + + if (const auto predicate = std::get_if<PredicateNode>(&*node)) { + std::string temporary = AllocTemporary(); + switch (const auto index = predicate->GetIndex(); index) { + case Tegra::Shader::Pred::UnusedIndex: + AddLine("MOV.S {}, -1;", temporary); + break; + case Tegra::Shader::Pred::NeverExecute: + AddLine("MOV.S {}, 0;", temporary); + break; + default: + AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index)); + break; + } + if (predicate->IsNegated()) { + AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary); + } + return temporary; + } + + if (const auto abuf = std::get_if<AbufNode>(&*node)) { + if (abuf->IsPhysicalBuffer()) { + UNIMPLEMENTED_MSG("Physical buffers are not implemented"); + return "{0, 0, 0, 0}.x"; + } + + const Attribute::Index index = abuf->GetIndex(); + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (index) { + case Attribute::Index::Position: { + if (stage == ShaderType::Geometry) { + return fmt::format("{}_position[{}].{}", StageInputName(stage), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.position.{}", StageInputName(stage), swizzle); + } + } + case Attribute::Index::TessCoordInstanceIDVertexID: + ASSERT(stage == ShaderType::Vertex); + switch (element) { + case 2: + return "vertex.instance"; + case 3: + return "vertex.id"; + } + UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); + break; + case Attribute::Index::PointCoord: + switch (element) { + case 0: + return "fragment.pointcoord.x"; + case 1: + return "fragment.pointcoord.y"; + } + UNIMPLEMENTED(); + break; + case Attribute::Index::FrontFacing: { + ASSERT(stage == ShaderType::Fragment); + ASSERT(element == 3); + const std::string temporary = AllocVectorTemporary(); + AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};"); + AddLine("MOV.U.CC RC.x, -RC;"); + AddLine("MOV.S {}.x, 0;", temporary); + AddLine("MOV.S {}.x (NE.x), -1;", temporary); + return fmt::format("{}.x", temporary); + } + default: + if 
(IsGenericAttribute(index)) { + if (stage == ShaderType::Geometry) { + return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.attrib[{}].{}", StageInputName(stage), + GetGenericAttributeIndex(index), swizzle); + } + } + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + break; + } + return "{0, 0, 0, 0}.x"; + } + + if (const auto cbuf = std::get_if<CbufNode>(&*node)) { + std::string offset_string; + const auto& offset = cbuf->GetOffset(); + if (const auto imm = std::get_if<ImmediateNode>(&*offset)) { + offset_string = std::to_string(imm->GetValue()); + } else { + offset_string = Visit(offset); + } + std::string temporary = AllocTemporary(); + AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); + return temporary; + } + + if (const auto gmem = std::get_if<GmemNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); + return temporary; + } + + if (const auto lmem = std::get_if<LmemNode>(&*node)) { + std::string temporary = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", temporary, temporary); + AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); + return temporary; + } + + if (const auto smem = std::get_if<SmemNode>(&*node)) { + std::string temporary = Visit(smem->GetAddress()); + AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); + return temporary; + } + + if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } + + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + if (const auto amend_index = conditional->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition())); + AddLine("IF NE.x;"); + VisitBlock(conditional->GetCode()); + AddLine("ENDIF;"); + return {}; + } + + if ([[maybe_unused]] const auto cmt = std::get_if<CommentNode>(&*node)) { + // Uncommenting this will generate invalid code. GLASM lacks comments. 
+ // AddLine("// {}", cmt->GetText()); + return {}; + } + + UNIMPLEMENTED(); + return {}; +} + +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + UNIMPLEMENTED_IF(meta.sampler.is_indexed); + + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; + const std::size_t count = operation.GetOperandsCount(); + std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + if (meta.sampler.is_array) { + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; + } + if (meta.sampler.is_shadow) { + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; + } + return {temporary, temporary, i}; +} + +std::string ARBDecompiler::BuildAoffi(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + if (meta.aoffi.empty()) { + return {}; + } + const std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (auto& node : meta.aoffi) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node)); + } + return fmt::format(", offset({})", temporary); +} + +std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // address = c[binding].xy + // length = c[binding].z + const u32 binding = global_memory_names.at(gmem.GetDescriptor()); + + const std::string pointer = AllocLongVectorTemporary(); + std::string temporary = AllocTemporary(); + + AddLine("PK64.U {}, c[{}];", pointer, binding); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), + Visit(gmem.GetBaseAddress())); + AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); + return fmt::format("{}.x", pointer); +} + +void ARBDecompiler::Exit() { + if (stage != ShaderType::Fragment) { + AddLine("RET;"); + return; + } + + const auto safe_get_register = [this](u32 reg) -> std::string { + // TODO(Rodrigo): Replace with contains once C++20 releases + const auto& used_registers = ir.GetRegisters(); + if (used_registers.find(reg) != used_registers.end()) { + return fmt::format("R{}.x", reg); + } + return "{0, 0, 0, 0}.x"; + }; + + const auto& header = ir.GetHeader(); + u32 current_reg = 0; + for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) { + for (u32 component = 0; component < 4; ++component) { + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; + } + AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component), + safe_get_register(current_reg)); + ++current_reg; + } + } + if (header.ps.omap.depth) { + AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1)); + } + + AddLine("RET;"); +} + +std::string ARBDecompiler::Assign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string dest_name; + if (const auto 
gpr = std::get_if<GprNode>(&*dest)) { + if (gpr->GetIndex() == Register::ZeroIndex) { + // Writing to Register::ZeroIndex is a no op + return {}; + } + dest_name = fmt::format("R{}.x", gpr->GetIndex()); + } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (const Attribute::Index index = abuf->GetIndex()) { + case Attribute::Index::Position: + dest_name = fmt::format("result.position.{}", swizzle); + break; + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + case 2: + if (!device.HasNvViewportArray2()) { + LOG_ERROR( + Render_OpenGL, + "NV_viewport_array2 is missing. Maxwell gen 2 or better is required."); + return {}; + } + dest_name = element == 1 ? "result.layer.x" : "result.viewport.x"; + break; + case 3: + dest_name = "result.pointsize.x"; + break; + } + break; + case Attribute::Index::ClipDistances0123: + dest_name = fmt::format("result.clip[{}].x", element); + break; + case Attribute::Index::ClipDistances4567: + dest_name = fmt::format("result.clip[{}].x", element + 4); + break; + default: + if (!IsGenericAttribute(index)) { + UNREACHABLE(); + return {}; + } + dest_name = + fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle); + break; + } + } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + const std::string address = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", address, address); + dest_name = fmt::format("lmem[{}].x", address); + } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { + AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress())); + ResetTemporaries(); + return {}; + } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); + AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); + ResetTemporaries(); + return {}; + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", dest_name, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::Select(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), + Visit(operation[2])); + return temporary; +} + +std::string ARBDecompiler::FClamp(Operation operation) { + // 1.0f in hex, replace with std::bit_cast on C++20 + static constexpr u32 POSITIVE_ONE = 0x3f800000; + + std::string temporary = AllocTemporary(); + const Node& value = operation[0]; + const Node& low = operation[1]; + const Node& high = operation[2]; + const auto* const imm_low = std::get_if<ImmediateNode>(&*low); + const auto* const imm_high = std::get_if<ImmediateNode>(&*high); + if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { + AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); + } else { + AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high)); + AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low)); + } + return temporary; +} + +std::string ARBDecompiler::FCastHalf0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FCastHalf1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.y, {};", temporary, 
Visit(operation[0])); + AddLine("MOV {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FSqrt(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); + AddLine("RCP.F32 {}, {};", temporary, temporary); + return temporary; +} + +std::string ARBDecompiler::FSwizzleAdd(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return fmt::format("{}.x", temporary); + } + + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); + AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); + AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); + AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary); + AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary); + AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary); + AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HAdd2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HMul2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HFma2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string tmp3 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2])); + AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HAbsolute(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, |{}|;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HNegate(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("MOVC.S RC.x, {};", Visit(operation[1])); + AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary); + AddLine("MOVC.S RC.x, {};", Visit(operation[2])); + AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HClamp(Operation operation) { + const std::string tmp1 = 
AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HCastFloat(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary); + AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HUnpack(Operation operation) { + std::string operand = Visit(operation[0]); + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H0_H0: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H1_H1: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + } + UNREACHABLE(); + return "{0, 0, 0, 0}.x"; +} + +std::string ARBDecompiler::HMergeF32(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.x, {}.z;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.w;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HPack2(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0])); + AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::LogicalAssign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string target; + + 
if (const auto pred = std::get_if<PredicateNode>(&*dest)) { + ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment"); + + const Tegra::Shader::Pred index = pred->GetIndex(); + switch (index) { + case Tegra::Shader::Pred::NeverExecute: + case Tegra::Shader::Pred::UnusedIndex: + // Writing to these predicates is a no-op + return {}; + } + target = fmt::format("P{}.x", static_cast<u64>(index)); + } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", target, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::LogicalPick2(Operation operation) { + std::string temporary = AllocTemporary(); + const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue(); + AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); + return temporary; +} + +std::string ARBDecompiler::LogicalAnd2(Operation operation) { + std::string temporary = AllocTemporary(); + const std::string op = Visit(operation[0]); + AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); + return temporary; +} + +std::string ARBDecompiler::FloatOrdered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, -1;", temporary); + AddLine("MOV.S {} (NAN.x), 0;", temporary); + AddLine("MOV.S {} (NAN.y), 0;", temporary); + return temporary; +} + +std::string ARBDecompiler::FloatUnordered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NAN.x), -1;", temporary); + AddLine("MOV.S {} (NAN.y), -1;", temporary); + return temporary; +} + +std::string ARBDecompiler::LogicalAddCarry(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("IF CF.x;"); + AddLine("MOV.S {}, -1;", temporary); + AddLine("ENDIF;"); + return temporary; +} + +std::string ARBDecompiler::Texture(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string_view opcode = "TEX"; + std::string extra; + if (meta.bias) { + ASSERT(!meta.lod); + opcode = "TXB"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias)); + } else { + const std::string bias = AllocTemporary(); + AddLine("MOV.F {}, {};", bias, Visit(meta.bias)); + extra = fmt::format(" {},", bias); + } + } + if (meta.lod) { + ASSERT(!meta.bias); + opcode = "TXL"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } else { + const std::string lod = AllocTemporary(); + AddLine("MOV.F {}, {};", lod, Visit(meta.lod)); + extra = fmt::format(" {},", lod); + } + } + + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + 
+std::string ARBDecompiler::TextureGather(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string comp; + if (!meta.sampler.is_shadow) { + const auto& immediate = std::get<ImmediateNode>(*meta.component); + comp = fmt::format(".{}", Swizzle(immediate.GetValue())); + } + + AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryDimensions(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0"; + AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryLod(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::size_t count = operation.GetOperandsCount(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta)); + AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary); + AddLine("TRUNC.S {}, {};", temporary, temporary); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TexelFetch(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + if (!meta.sampler.is_buffer) { + ASSERT(swizzle < 4); + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), + BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGradient(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const std::string ddx = AllocVectorTemporary(); + const std::string ddy = AllocVectorTemporary(); + const std::string coord = std::get<1>(BuildCoords(operation)); + + const std::size_t num_components = meta.derivates.size() / 2; + for (std::size_t index = 0; index < num_components; ++index) { + const char swizzle = Swizzle(index); + AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2])); + AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1])); + } + + 
const std::string_view result = coord; + AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element)); + return fmt::format("{}.x", result); +} + +std::string ARBDecompiler::ImageLoad(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t count = operation.GetOperandsCount(); + const std::string_view type = ImageType(meta.image.type); + + const std::string temporary = AllocVectorTemporary(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type); + AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::ImageStore(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + const std::string_view type = ImageType(meta.image.type); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type); + return {}; +} + +std::string ARBDecompiler::Branch(Operation operation) { + const auto target = std::get<ImmediateNode>(*operation[0]); + AddLine("MOV.U PC.x, {};", target.GetValue()); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::BranchIndirect(Operation operation) { + AddLine("MOV.U PC.x, {};", Visit(operation[0])); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::PushFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue(); + const std::string_view stack_name = StackName(stack); + AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target); + AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + return {}; +} + +std::string ARBDecompiler::PopFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const std::string_view stack_name = StackName(stack); + AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::Exit(Operation) { + Exit(); + return {}; +} + +std::string ARBDecompiler::Discard(Operation) { + AddLine("KIL TR;"); + return {}; +} + +std::string ARBDecompiler::EmitVertex(Operation) { + AddLine("EMIT;"); + return {}; +} + +std::string ARBDecompiler::EndPrimitive(Operation) { + AddLine("ENDPRIM;"); + return {}; +} + +std::string ARBDecompiler::InvocationId(Operation) { + return "primitive.invocation"; +} + +std::string ARBDecompiler::YNegate(Operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + std::string temporary = AllocTemporary(); + 
AddLine("MOV.F {}, 1;", temporary); + return temporary; +} + +std::string ARBDecompiler::ThreadId(Operation) { + return fmt::format("{}.threadid", StageInputName(stage)); +} + +std::string ARBDecompiler::ShuffleIndexed(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + return Visit(operation[0]); + } + const std::string temporary = AllocVectorTemporary(); + AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]), + Visit(operation[1])); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::Barrier(Operation) { + AddLine("BAR;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGroup(Operation) { + AddLine("MEMBAR.CTA;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { + AddLine("MEMBAR;"); + return {}; +} + +} // Anonymous namespace + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier) { + return ARBDecompiler(device, ir, registry, stage, identifier).Code(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h new file mode 100644 index 000000000..6afc87220 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> +#include <string_view> + +#include "common/common_types.h" + +namespace Tegra::Engines { +enum class ShaderType : u32; +} + +namespace VideoCommon::Shader { +class ShaderIR; +class Registry; +} // namespace VideoCommon::Shader + +namespace OpenGL { + +class Device; + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 4eb37a96c..b1c4cd62f 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" @@ -21,22 +22,54 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } -CachedBufferBlock::~CachedBufferBlock() = default; 
+Buffer::~Buffer() = default; + +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { + glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), + data); +} -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + const GLintptr gl_offset = static_cast<GLintptr>(offset); + if (read_buffer.handle == 0) { + read_buffer.Create(); + glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, + GL_STREAM_READ); + } + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); + glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { + glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +} + +OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const Device& device_, std::size_t stream_size) + : GenericBufferCache{rasterizer, gpu_memory, cpu_memory, + std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, + device{device_} { if (!device.HasFastBufferSubData()) { return; } - static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); + static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); for (const GLuint cbuf : cbufs) { glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -47,49 +80,21 @@ OGLBufferCache::~OGLBufferCache() { glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(cpu_addr, size); -} - -void OGLBufferCache::WriteBarrier() { - glMemoryBarrier(GL_ALL_BARRIER_BITS); +std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, cpu_addr, size); } -const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { - static const GLuint null_buffer = 0; - return &null_buffer; -} - -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { - glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(), - 
static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset), - static_cast<GLsizeiptr>(size)); +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { + return {0, 0, 0}; } OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint& cbuf = cbufs[cbuf_cursor++]; + const GLuint cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {&cbuf, 0}; + return {cbuf, 0, 0}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index d94a11252..f75b32e31 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -24,59 +23,59 @@ class Device; class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferBlock; +class Buffer : public VideoCommon::BufferBlock { +public: + explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); + ~Buffer(); -using Buffer = std::shared_ptr<CachedBufferBlock>; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; + void Upload(std::size_t offset, std::size_t size, const u8* data); -class CachedBufferBlock : public VideoCommon::BufferBlock { -public: - explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); - ~CachedBufferBlock(); + void Download(std::size_t offset, std::size_t size, u8* data); + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size); - const GLuint* GetHandle() const { - return &gl_buffer.handle; + GLuint Handle() const noexcept { + return gl_buffer.handle; + } + + u64 Address() const noexcept { + return gpu_address; } private: - OGLBuffer gl_buffer{}; + OGLBuffer gl_buffer; + OGLBuffer read_buffer; + u64 gpu_address = 0; }; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class OGLBufferCache final : public GenericBufferCache { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const Device& device, std::size_t stream_size); ~OGLBufferCache(); - const GLuint* GetEmptyBuffer(std::size_t) override; + BufferInfo GetEmptyBuffer(std::size_t) override; void Acquire() noexcept { cbuf_cursor = 0; } protected: - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - void WriteBarrier() override; - - const GLuint* ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: + static constexpr 
std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + + const Device& device; + std::size_t cbuf_cursor = 0; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> - cbufs; + std::array<GLuint, NUM_CBUFS> cbufs{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index c286502ba..a94e4f72e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -6,6 +6,7 @@ #include <array> #include <cstddef> #include <cstring> +#include <limits> #include <optional> #include <vector> @@ -13,6 +14,7 @@ #include "common/logging/log.h" #include "common/scope_exit.h" +#include "core/settings.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, - GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, - GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; +constexpr std::array LimitUBOs = { + GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, + GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; constexpr std::array LimitSSBOs = { - GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, + GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; -constexpr std::array LimitSamplers = { - GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS}; +constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; -constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, - GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, - GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, - GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; +constexpr std::array LimitImages = { + GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, + GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; template <typename T> T GetInteger(GLenum pname) { @@ -84,10 +89,17 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { return std::exchange(base, base + amount); } +std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { + std::array<u32, Tegra::Engines::MaxShaderTypes> max; + std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); + return max; +} + std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; - static 
std::array<std::size_t, 5> stage_swizzle = {0, 1, 2, 3, 4}; + static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4}; const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS); const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS); const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS); @@ -111,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); u32 base_images = 0; - // Reserve more image bindings on fragment and vertex stages. + // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8. + // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the + // fragment stage, and at least 1 for the rest of the stages. + // So far games are observed to use 1 image binding on vertex and 4 on fragment stages. + + // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]); - bindings[0].image = - Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + + // This is guaranteed to be at least 1. + const u32 total_extracted_images = num_images / (NumStages - 1); // Reserve the other image bindings. - const u32 total_extracted_images = num_images / (NumStages - 2); - for (std::size_t i = 2; i < NumStages; ++i) { + for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; + if (stage == 4) { + continue; + } bindings[stage].image = Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); } @@ -132,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin } bool IsASTCSupported() { + static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; static constexpr std::array formats = { GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, @@ -148,59 +169,94 @@ bool IsASTCSupported() { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, }; - return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { - GLint supported; - glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, - &supported); - return supported == GL_TRUE; - }) == formats.end(); + static constexpr std::array required_support = { + GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE, + GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE, + }; + + for (const GLenum target : targets) { + for (const GLenum format : formats) { + for (const GLenum support : required_support) { + GLint value; + glGetInternalformativ(target, format, support, 1, &value); + if (value != GL_FULL_SUPPORT) { + return false; + } + } + } + } + return true; } } // Anonymous namespace -Device::Device() : base_bindings{BuildBaseBindings()} { +Device::Device() + : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); + const std::string_view version = reinterpret_cast<const 
char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; - const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; + + bool disable_fast_buffer_sub_data = false; + if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { + LOG_WARNING( + Render_OpenGL, + "Beta driver 443.24 is known to have issues. There might be performance issues."); + disable_fast_buffer_sub_data = true; + } uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE); has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && GLAD_GL_NV_shader_thread_shuffle; has_shader_ballot = GLAD_GL_ARB_shader_ballot; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); + has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); has_astc = IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_broken_compute = is_intel_proprietary; - has_fast_buffer_sub_data = is_nvidia; + has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; + has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + + // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive + // uniform buffers as "push constants" + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + + use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() && + GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && + GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; + + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + + if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) { + LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); + } } Device::Device(std::nullptr_t) { - uniform_buffer_alignment = 0; + max_uniform_buffers.fill(std::numeric_limits<u32>::max()); + uniform_buffer_alignment = 4; + shader_storage_alignment = 4; max_vertex_attributes = 16; max_varyings = 15; + max_compute_shared_memory_size = 0x10000; has_warp_intrinsics = true; has_shader_ballot = true; has_vertex_viewport_layer = true; has_image_load_formatted = true; + has_texture_shadow_lod = true; has_variable_aoffi = true; - has_component_indexing_bug = false; - has_broken_compute = false; - has_precise_bug = false; } bool Device::TestVariableAoffi() { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index a55050cb5..8a4b6b9fc 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -24,6 +24,10 @@ public: explicit Device(); explicit Device(std::nullptr_t); + 
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { + return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; + } + const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { return base_bindings[stage_index]; } @@ -48,6 +52,10 @@ public: return max_varyings; } + u32 GetMaxComputeSharedMemorySize() const { + return max_compute_shared_memory_size; + } + bool HasWarpIntrinsics() const { return has_warp_intrinsics; } @@ -64,6 +72,14 @@ public: return has_image_load_formatted; } + bool HasTextureShadowLod() const { + return has_texture_shadow_lod; + } + + bool HasVertexBufferUnifiedMemory() const { + return has_vertex_buffer_unified_memory; + } + bool HasASTC() const { return has_astc; } @@ -80,33 +96,47 @@ public: return has_precise_bug; } - bool HasBrokenCompute() const { - return has_broken_compute; - } - bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } + bool HasNvViewportArray2() const { + return has_nv_viewport_array2; + } + + bool UseAssemblyShaders() const { + return use_assembly_shaders; + } + + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); - std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; + std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; + std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; std::size_t uniform_buffer_alignment{}; std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + u32 max_compute_shared_memory_size{}; bool has_warp_intrinsics{}; bool has_shader_ballot{}; bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; + bool has_texture_shadow_lod{}; + bool has_vertex_buffer_unified_memory{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; - bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; + bool has_nv_viewport_array2{}; + bool use_assembly_shaders{}; + bool use_asynchronous_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp new file mode 100644 index 000000000..b532fdcc2 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -0,0 +1,73 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/assert.h" + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" + +namespace OpenGL { + +GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {} + +GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed) + : FenceBase(address, payload, is_stubbed) {} + +GLInnerFence::~GLInnerFence() = default; + +void GLInnerFence::Queue() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle == 0); + sync_object.Create(); +} + +bool GLInnerFence::IsSignaled() const { + if (is_stubbed) { + return true; + } + ASSERT(sync_object.handle != 0); + GLsizei length; + GLint sync_status; + glGetSynciv(sync_object.handle, GL_SYNC_STATUS, sizeof(GLint), &length, &sync_status); + return sync_status == GL_SIGNALED; +} + +void GLInnerFence::Wait() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle != 0); + glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); +} + +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, + OGLBufferCache& buffer_cache, QueryCache& query_cache) + : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {} + +Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(value, is_stubbed); +} + +Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(addr, value, is_stubbed); +} + +void FenceManagerOpenGL::QueueFence(Fence& fence) { + fence->Queue(); +} + +bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const { + return fence->IsSignaled(); +} + +void FenceManagerOpenGL::WaitFence(Fence& fence) { + fence->Wait(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h new file mode 100644 index 000000000..da1dcdace --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <memory> + +#include "common/common_types.h" +#include "video_core/fence_manager.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" + +namespace OpenGL { + +class GLInnerFence : public VideoCommon::FenceBase { +public: + GLInnerFence(u32 payload, bool is_stubbed); + GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed); + ~GLInnerFence(); + + void Queue(); + + bool IsSignaled() const; + + void Wait(); + +private: + OGLSync sync_object; +}; + +using Fence = std::shared_ptr<GLInnerFence>; +using GenericFenceManager = + VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>; + +class FenceManagerOpenGL final : public GenericFenceManager { +public: + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, + QueryCache& query_cache); + +protected: + Fence CreateFence(u32 value, bool is_stubbed) override; + Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override; + void QueueFence(Fence& fence) override; + bool IsFenceSignaled(Fence& fence) const override; + void WaitFence(Fence& fence) override; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index f12e9f55f..1a3d9720e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,12 +30,11 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) - : VideoCommon::QueryCacheBase< - QueryCache, CachedQuery, CounterStream, HostCounter, - std::vector<OGLQuery>>{system, - static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, - gl_rasterizer{gl_rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory) + : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>( + rasterizer, maxwell3d, gpu_memory), + gl_rasterizer{rasterizer} {} QueryCache::~QueryCache() = default; @@ -90,13 +89,15 @@ u64 HostCounter::BlockingQuery() const { CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::~CachedQuery() = default; + CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { - VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); cache = rhs.cache; type = rhs.type; + CachedQueryBase<HostCounter>::operator=(std::move(rhs)); return *this; } diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index d8e7052a1..82cac51ee 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,10 +26,11 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; -class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, - 
HostCounter, std::vector<OGLQuery>> { +class QueryCache final + : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -40,6 +41,7 @@ public: private: RasterizerOpenGL& gl_rasterizer; + std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { @@ -62,10 +64,12 @@ class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); - CachedQuery(CachedQuery&& rhs) noexcept; - CachedQuery(const CachedQuery&) = delete; + ~CachedQuery() override; + CachedQuery(CachedQuery&& rhs) noexcept; CachedQuery& operator=(CachedQuery&& rhs) noexcept; + + CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; void Flush() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f4598fbf7..cfddbde5d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -54,19 +55,36 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = + NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; +constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = + NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; + +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { - if (entry.IsBindless()) { - const Tegra::Texture::TextureHandle tex_handle = - engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); - return engine.GetTextureInfo(tex_handle); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } + if (entry.is_bindless) { + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 offset = - entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); + const u32 offset = entry.offset + static_cast<u32>(index * 
gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { return engine.GetStageTexture(shader_type, offset); } else { @@ -89,23 +107,84 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { + const u8 index = location / 4; + if (index >= 8 && index <= 39) { + return {GL_GENERIC_ATTRIB_NV, index - 8}; + } + if (index >= 48 && index <= 55) { + return {GL_TEXTURE_COORD_NV, index - 48}; + } + switch (index) { + case 7: + return {GL_POSITION, 0}; + case 40: + return {GL_PRIMARY_COLOR_NV, 0}; + case 41: + return {GL_SECONDARY_COLOR_NV, 0}; + case 42: + return {GL_BACK_PRIMARY_COLOR_NV, 0}; + case 43: + return {GL_BACK_SECONDARY_COLOR_NV, 0}; + } + UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + return {GL_POSITION, 0}; +} + void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } +void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { + if (num_ssbos == 0) { + return; + } + glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), + reinterpret_cast<const GLuint*>(ssbos)); +} + } // Anonymous namespace -RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, - StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, - shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_) + : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), + screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), + texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker), + shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device), + query_cache(*this, maxwell3d, gpu_memory), + buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), + async_shaders(emu_window) { CheckExtensions(); + + unified_uniform_buffer.Create(); + glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); + + if (device.UseAssemblyShaders()) { + glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + for (const GLuint cbuf : staging_cbufs) { + glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), + nullptr, 0); + } + } + + if (device.UseAsynchronousShaders()) { + async_shaders.AllocateWorkers(); + } } -RasterizerOpenGL::~RasterizerOpenGL() {} +RasterizerOpenGL::~RasterizerOpenGL() { + if 
(device.UseAssemblyShaders()) { + glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + } +} void RasterizerOpenGL::CheckExtensions() { if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { @@ -116,8 +195,7 @@ void RasterizerOpenGL::CheckExtensions() { } void RasterizerOpenGL::SetupVertexFormat() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } @@ -131,13 +209,13 @@ void RasterizerOpenGL::SetupVertexFormat() { // avoid OpenGL errors. // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't // assume every shader uses them all. - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexFormat0 + index]) { continue; } flags[Dirty::VertexFormat0 + index] = false; - const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto attrib = maxwell3d.regs.vertex_attrib_format[index]; const auto gl_index = static_cast<GLuint>(index); // Disable constant attributes. @@ -150,9 +228,10 @@ void RasterizerOpenGL::SetupVertexFormat() { if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { glVertexAttribIFormat(gl_index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); + MaxwellToGL::VertexFormat(attrib), attrib.offset); } else { - glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + glVertexAttribFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexFormat(attrib), attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); } glVertexAttribBinding(gl_index, attrib.buffer); @@ -160,8 +239,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } void RasterizerOpenGL::SetupVertexBuffer() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexBuffers]) { return; } @@ -169,9 +247,11 @@ void RasterizerOpenGL::SetupVertexBuffer() { MICROPROFILE_SCOPE(OpenGL_VB); + const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); + // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; } @@ -184,27 +264,37 @@ void RasterizerOpenGL::SetupVertexBuffer() { const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - - ASSERT(end > start); - const u64 size = end - start + 1; - const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); - - // Bind the vertex array to the buffer at the current offset. 
- vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer, - vertex_buffer_offset, vertex_array.stride); + ASSERT(end >= start); + + const GLuint gl_index = static_cast<GLuint>(index); + const u64 size = end - start; + if (size == 0) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + if (use_unified_memory) { + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); + } + continue; + } + const auto info = buffer_cache.UploadMemory(start, size); + if (use_unified_memory) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, + info.address + info.offset, size); + } else { + glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); + } } } void RasterizerOpenGL::SetupVertexInstances() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; } flags[Dirty::VertexInstances] = false; - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; } @@ -219,24 +309,23 @@ void RasterizerOpenGL::SetupVertexInstances() { GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t size = CalculateIndexBufferSize(); - const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - vertex_array_pushbuffer.SetIndexBuffer(buffer); - return offset; + const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); + return info.offset; } void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); - auto& gpu = system.GPU().Maxwell3D(); u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; + const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: program_manager.UseGeometryShader(0); @@ -251,23 +340,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } // Currently this stages are not supported in the OpenGL backend. - // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL - if (program == Maxwell::ShaderProgram::TesselationControl) { - continue; - } else if (program == Maxwell::ShaderProgram::TesselationEval) { + // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL + if (program == Maxwell::ShaderProgram::TesselationControl || + program == Maxwell::ShaderProgram::TesselationEval) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); - // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 
0 : index - 1; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); - - const GLuint program_handle = shader->GetHandle(); + const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: @@ -284,6 +365,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { shader_config.enable.Value(), shader_config.offset); } + // Stage indices are 0 - 5 + const std::size_t stage = index == 0 ? 0 : index - 1; + SetupDrawConstBuffers(stage, shader); + SetupDrawGlobalMemory(stage, shader); + SetupDrawTextures(stage, shader); + SetupDrawImages(stage, shader); + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -298,11 +386,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - gpu.dirty.flags[Dirty::Shaders] = false; + maxwell3d.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -312,49 +400,42 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { const GPUVAddr start = regs.vertex_array[index].StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end > start); - size += end - start + 1; + size += end - start; + ASSERT(end >= start); } return size; } std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } -void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - shader_cache.LoadDiskCache(stop_loading, callback); -} - -void RasterizerOpenGL::SetupDirtyFlags() { - state_tracker.Initialize(); + shader_cache.LoadDiskCache(title_id, stop_loading, callback); } void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); - auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); - View depth_surface = texture_cache.GetDepthBufferSurface(); + View depth_surface = texture_cache.GetDepthBufferSurface(true); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces FramebufferCacheKey key; const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); for (std::size_t index = 0; index < colors_count; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index)}; + View 
color_surface{texture_cache.GetColorBufferSurface(index, true)}; if (!color_surface) { continue; } @@ -378,40 +459,62 @@ void RasterizerOpenGL::ConfigureFramebuffers() { glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, - bool using_stencil_fb) { - auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; +void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { + const auto& regs = maxwell3d.regs; texture_cache.GuardRenderTargets(true); View color_surface; - if (using_color_fb) { + + if (using_color) { + // Determine if we have to preserve the contents. + // First we have to make sure all clear masks are enabled. + bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G || + !regs.clear_buffers.B || !regs.clear_buffers.A; const std::size_t index = regs.clear_buffers.RT; - color_surface = texture_cache.GetColorBufferSurface(index); + if (regs.clear_flags.scissor) { + // Then we have to confirm scissor testing clears the whole image. + const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.rt[index].width; + preserve_contents |= scissor.max_y < regs.rt[index].height; + } + + color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents); texture_cache.MarkColorBufferInUse(index); } + View depth_surface; - if (using_depth_fb || using_stencil_fb) { - depth_surface = texture_cache.GetDepthBufferSurface(); + if (using_depth_stencil) { + bool preserve_contents = false; + if (regs.clear_flags.scissor) { + // For depth stencil clears we only have to confirm scissor test covers the whole image. 
+ const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.zeta_width; + preserve_contents |= scissor.max_y < regs.zeta_height; + } + + depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); texture_cache.MarkDepthBufferInUse(); } texture_cache.GuardRenderTargets(false); FramebufferCacheKey key; - key.colors[0] = color_surface; - key.zeta = depth_surface; + key.colors[0] = std::move(color_surface); + key.zeta = std::move(depth_surface); state_tracker.NotifyFramebuffer(); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } void RasterizerOpenGL::Clear() { - const auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -419,8 +522,7 @@ void RasterizerOpenGL::Clear() { if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; - } - if (use_color) { + state_tracker.NotifyColorMask0(); glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); @@ -458,7 +560,7 @@ void RasterizerOpenGL::Clear() { UNIMPLEMENTED_IF(regs.clear_flags.viewport); - ConfigureClearFramebuffer(use_color, use_depth, use_stencil); + ConfigureClearFramebuffer(use_color, use_depth || use_stencil); if (use_color) { glClearBufferfv(GL_COLOR, 0, regs.clear_color); @@ -477,7 +579,6 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); - auto& gpu = system.GPU().Maxwell3D(); query_cache.UpdateCounters(); @@ -502,6 +603,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { SyncFramebufferSRGB(); buffer_cache.Acquire(); + current_cbuf = 0; std::size_t buffer_size = CalculateVertexArraysSize(); @@ -511,20 +613,28 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Uniform space for the 5 shader stages - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * - Maxwell::MaxShaderStage; + buffer_size = + Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); - vertex_array_pushbuffer.Setup(); // Upload vertex and index data. SetupVertexBuffer(); @@ -534,21 +644,19 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { index_buffer_offset = SetupIndexBuffer(); } - // Prepare packed bindings. - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - // Setup emulation uniform buffer. 
- GLShader::MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); - const auto [buffer, offset] = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - bind_ubo_pushbuffer.Push(EmulationUniformBlockBinding, buffer, offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + if (!device.UseAssemblyShaders()) { + MaxwellUniformData ubo; + ubo.SetFromRegs(maxwell3d); + const auto info = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, + static_cast<GLsizeiptr>(sizeof(ubo))); + } // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -557,11 +665,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Signal the buffer cache that we are not going to upload more things. buffer_cache.Unmap(); - // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. - vertex_array_pushbuffer.Bind(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - program_manager.BindGraphicsPipeline(); if (texture_cache.TextureBarrier()) { @@ -570,14 +673,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { BeginTransformFeedback(primitive_mode); - const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); const GLsizei num_instances = - static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + static_cast<GLsizei>(is_instanced ? 
maxwell3d.mme_draw.instance_count : 1); if (is_indexed) { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); - const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); } else if (num_instances == 1 && base_instance == 0) { @@ -596,8 +699,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { base_instance); } } else { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count); if (num_instances == 1 && base_instance == 0) { glDrawArrays(primitive_mode, base_vertex, num_vertices); } else if (base_instance == 0) { @@ -611,37 +714,32 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { EndTransformFeedback(); ++num_queued_commands; + + gpu.TickWork(); } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (device.HasBrokenCompute()) { - return; - } - buffer_cache.Acquire(); + current_cbuf = 0; auto kernel = shader_cache.GetComputeKernel(code_addr); + program_manager.BindCompute(kernel->GetHandle()); + SetupComputeTextures(kernel); SetupComputeImages(kernel); - program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); buffer_cache.Map(buffer_size); - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); buffer_cache.Unmap(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -667,6 +765,13 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { query_cache.FlushRegion(addr, size); } +bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + if (!Settings::IsGPULevelHigh()) { + return buffer_cache.MustFlushRegion(addr, size); + } + return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); +} + void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { @@ -678,13 +783,64 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { query_cache.InvalidateRegion(addr, size); } +void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + if (addr == 0 || size == 0) { + return; + } + texture_cache.OnCPUWrite(addr, size); + shader_cache.OnCPUWrite(addr, size); + buffer_cache.OnCPUWrite(addr, size); +} + 
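Editorial note: OnCPUWrite above, together with the SyncGuestHost hook that follows, implements deferred invalidation — guest CPU writes only record the affected ranges on the fast path, and the caches reconcile those marks in a single pass when SyncGuestHost runs. The following is a minimal illustrative sketch of that pattern (not the actual yuzu cache code; it uses a plain std::vector of ranges instead of the interval structures the real texture/buffer/shader caches maintain):

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative only: a cache that records CPU-written ranges and
// invalidates them lazily, mirroring the OnCPUWrite/SyncGuestHost split.
class DeferredInvalidationCache {
public:
    // Called on the fast path; just remember the written range.
    void OnCPUWrite(std::uintptr_t addr, std::uint64_t size) {
        pending_ranges.emplace_back(addr, size);
    }

    // Called once before the GPU consumes guest memory again.
    void SyncGuestHost() {
        for (const auto& [addr, size] : pending_ranges) {
            InvalidateRegion(addr, size);
        }
        pending_ranges.clear();
    }

private:
    void InvalidateRegion(std::uintptr_t addr, std::uint64_t size) {
        // A real cache would drop or re-upload entries overlapping [addr, addr + size).
    }

    std::vector<std::pair<std::uintptr_t, std::uint64_t>> pending_ranges;
};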
+void RasterizerOpenGL::SyncGuestHost() { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + texture_cache.SyncGuestHost(); + buffer_cache.SyncGuestHost(); + shader_cache.SyncGuestHost(); +} + +void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { + if (!gpu.IsAsync()) { + gpu_memory.Write<u32>(addr, value); + return; + } + fence_manager.SignalSemaphore(addr, value); +} + +void RasterizerOpenGL::SignalSyncPoint(u32 value) { + if (!gpu.IsAsync()) { + gpu.IncrementSyncPoint(value); + return; + } + fence_manager.SignalSyncPoint(value); +} + +void RasterizerOpenGL::ReleaseFences() { + if (!gpu.IsAsync()) { + return; + } + fence_manager.WaitPendingFences(); +} + void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { FlushRegion(addr, size); } InvalidateRegion(addr, size); } +void RasterizerOpenGL::WaitForIdle() { + // Place a barrier on everything that is not framebuffer related. + // This is related to another flag that is not currently implemented. + glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | + GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | + GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | + GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | + GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); +} + void RasterizerOpenGL::FlushCommands() { // Only flush when we have commands queued to OpenGL. if (num_queued_commands == 0) { @@ -739,40 +895,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { + static constexpr std::array PARAMETER_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& stages = maxwell3d.state.shader_stages; const auto& shader_stage = stages[stage_index]; - - u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetEntries().const_buffers) { - const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + const auto& entries = shader->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; + const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; + + const auto base_bindings = device.GetBaseBindings(stage_index); + u32 binding = device.UseAssemblyShaders() ? 
0 : base_bindings.uniform_buffer; + for (const auto& entry : entries.const_buffers) { + const u32 index = entry.GetIndex(); + const auto& buffer = shader_stage.const_buffers[index]; + SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, + base_unified_offset + index * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + + entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, + base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + const auto& entries = kernel->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; u32 binding = 0; - for (const auto& entry : kernel->GetEntries().const_buffers) { + for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, + use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, + NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { +void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, + const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + if (device.UseAssemblyShaders()) { + glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); + } return; } @@ -780,68 +968,112 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const // UBO alignment requirements. const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - bind_ubo_pushbuffer.Push(binding, cbuf, offset, size); + const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + + const std::size_t alignment = use_unified ? 
4 : device.GetUniformBufferAlignment(); + const GPUVAddr gpu_addr = buffer.address; + auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + + if (device.UseAssemblyShaders()) { + UNIMPLEMENTED_IF(use_unified); + if (info.offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); + info.handle = staging_cbuf; + info.offset = 0; + } + glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); + return; + } + + if (use_unified) { + glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, + unified_offset, size); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); + } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { + static constexpr std::array TARGET_LUT = { + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, + }; + + const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; + const auto& entries{shader->GetEntries().global_memory_entries}; - u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetEntries().global_memory_entries) { - const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); + + const bool assembly_shaders = device.UseAssemblyShaders(); + u32 binding = assembly_shaders ? 
0 : device.GetBaseBindings(stage_index).shader_storage_buffer; + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (assembly_shaders) { + UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; + const auto& entries{kernel->GetEntries().global_memory_entries}; + + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); u32 binding = 0; - for (const auto& entry : kernel->GetEntries().global_memory_entries) { - const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (device.UseAssemblyShaders()) { + UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size) { - const auto alignment{device.GetShaderStorageBufferAlignment()}; - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten()); - bind_ssbo_pushbuffer.Push(binding, ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); + GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { + const size_t alignment{device.GetShaderStorageBufferAlignment()}; + const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); + if (device.UseAssemblyShaders()) { + *ssbo = BindlessSSBO{ + .address = static_cast<GLuint64EXT>(info.address + info.offset), + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, + static_cast<GLsizeiptr>(size)); + } } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); - for (std::size_t i = 0; i < entry.Size(); ++i) { + for (std::size_t i = 0; i < entry.size; ++i) { const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); SetupTexture(binding++, texture, entry); } } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& compute = 
system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetEntries().samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); } } @@ -856,33 +1088,27 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu glBindTextureUnit(binding, 0); return; } - glBindTextureUnit(binding, view->GetTexture()); - - if (view->GetSurfaceParams().IsBuffer()) { - return; + const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); + glBindTextureUnit(binding, handle); + if (!view->GetSurfaceParams().IsBuffer()) { + glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } - // Apply swizzle to textures that are not buffers. - view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, - texture.tic.w_source); - - glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { - const auto& maxwell3d = system.GPU().Maxwell3D(); +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); + const auto shader_type = static_cast<ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { - const auto& compute = system.GPU().KeplerCompute(); +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } @@ -894,27 +1120,43 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); return; } - if (!tic.IsBuffer()) { - view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - } - if (entry.IsWritten()) { + if (entry.is_written) { view->MarkAsModified(texture_cache.Tick()); } - glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE, - view->GetFormat()); + const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat()); } void RasterizerOpenGL::SyncViewport() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; const bool dirty_viewport = flags[Dirty::Viewports]; + const bool dirty_clip_control = flags[Dirty::ClipControl]; + + if (dirty_clip_control || flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + + GLenum mode = MaxwellToGL::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0 && + regs.viewport_transform[0].scale_y < 0.0f) { + switch (mode) { + case GL_CW: + mode = 
GL_CCW; + break; + case GL_CCW: + mode = GL_CW; + break; + } + } + glFrontFace(mode); + } + if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { @@ -946,34 +1188,36 @@ void RasterizerOpenGL::SyncViewport() { const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; const GLdouble far_depth = src.translate_z + src.scale_z; glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth); + + if (!GLAD_GL_NV_viewport_swizzle) { + continue; + } + glViewportSwizzleNV(static_cast<GLuint>(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x), + MaxwellToGL::ViewportSwizzle(src.swizzle.y), + MaxwellToGL::ViewportSwizzle(src.swizzle.z), + MaxwellToGL::ViewportSwizzle(src.swizzle.w)); } } } void RasterizerOpenGL::SyncDepthClamp() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::DepthClampEnabled]) { return; } flags[Dirty::DepthClampEnabled] = false; - const auto& state = gpu.regs.view_volume_clip_control; - UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near, - "Unimplemented depth clamp separation!"); - - oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near); + oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { return; } flags[Dirty::ClipDistances] = false; - clip_mask &= gpu.regs.clip_distance_enabled; + clip_mask &= maxwell3d.regs.clip_distance_enabled; if (clip_mask == last_clip_distance_mask) { return; } @@ -989,9 +1233,8 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::CullTest]) { flags[Dirty::CullTest] = false; @@ -1003,34 +1246,27 @@ void RasterizerOpenGL::SyncCullMode() { glDisable(GL_CULL_FACE); } } - - if (flags[Dirty::FrontFace]) { - flags[Dirty::FrontFace] = false; - glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); - } } void RasterizerOpenGL::SyncPrimitiveRestart() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PrimitiveRestart]) { return; } flags[Dirty::PrimitiveRestart] = false; - if (gpu.regs.primitive_restart.enabled) { + if (maxwell3d.regs.primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); - glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index); } else { glDisable(GL_PRIMITIVE_RESTART); } } void RasterizerOpenGL::SyncDepthTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; - const auto& regs = gpu.regs; if (flags[Dirty::DepthMask]) { flags[Dirty::DepthMask] = false; glDepthMask(regs.depth_write_enabled ? 
GL_TRUE : GL_FALSE); @@ -1048,14 +1284,13 @@ void RasterizerOpenGL::SyncDepthTestState() { } void RasterizerOpenGL::SyncStencilTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::StencilTest]) { return; } flags[Dirty::StencilTest] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), @@ -1080,25 +1315,24 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncRasterizeEnable() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::RasterizeEnable]) { return; } flags[Dirty::RasterizeEnable] = false; - oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); + oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0); } void RasterizerOpenGL::SyncPolygonModes() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonModes]) { return; } flags[Dirty::PolygonModes] = false; - if (gpu.regs.fill_rectangle) { + const auto& regs = maxwell3d.regs; + if (regs.fill_rectangle) { if (!GLAD_GL_NV_fill_rectangle) { LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); @@ -1111,27 +1345,26 @@ void RasterizerOpenGL::SyncPolygonModes() { return; } - if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + if (regs.polygon_mode_front == regs.polygon_mode_back) { flags[Dirty::PolygonModeFront] = false; flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); return; } if (flags[Dirty::PolygonModeFront]) { flags[Dirty::PolygonModeFront] = false; - glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); } if (flags[Dirty::PolygonModeBack]) { flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back)); } } void RasterizerOpenGL::SyncColorMask() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ColorMasks]) { return; } @@ -1140,7 +1373,7 @@ void RasterizerOpenGL::SyncColorMask() { const bool force = flags[Dirty::ColorMaskCommon]; flags[Dirty::ColorMaskCommon] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.color_mask_common) { if (!force && !flags[Dirty::ColorMask0]) { return; @@ -1165,33 +1398,30 @@ void RasterizerOpenGL::SyncColorMask() { } void RasterizerOpenGL::SyncMultiSampleState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::MultisampleControl]) { return; } flags[Dirty::MultisampleControl] = false; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() 
{ - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FragmentClampColor]) { return; } flags[Dirty::FragmentClampColor] = false; - glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); + glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::BlendColor]) { flags[Dirty::BlendColor] = false; @@ -1248,14 +1478,13 @@ void RasterizerOpenGL::SyncBlendState() { } void RasterizerOpenGL::SyncLogicOpState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LogicOp]) { return; } flags[Dirty::LogicOp] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.logic_op.enable) { glEnable(GL_COLOR_LOGIC_OP); glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); @@ -1265,14 +1494,13 @@ void RasterizerOpenGL::SyncLogicOpState() { } void RasterizerOpenGL::SyncScissorTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::Scissors]) { return; } flags[Dirty::Scissors] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -1291,16 +1519,15 @@ void RasterizerOpenGL::SyncScissorTest() { } void RasterizerOpenGL::SyncPointState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PointSize]) { return; } flags[Dirty::PointSize] = false; - oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); - if (gpu.regs.vp_point_size.enable) { + if (maxwell3d.regs.vp_point_size.enable) { // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. glEnable(GL_PROGRAM_POINT_SIZE); return; @@ -1308,32 +1535,30 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). - glPointSize(std::max(1.0f, gpu.regs.point_size)); + glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncLineState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LineWidth]) { return; } flags[Dirty::LineWidth] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable); glLineWidth(regs.line_smooth_enable ? 
regs.line_width_smooth : regs.line_width_aliased); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonOffset]) { return; } flags[Dirty::PolygonOffset] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); @@ -1347,18 +1572,13 @@ void RasterizerOpenGL::SyncPolygonOffset() { } void RasterizerOpenGL::SyncAlphaTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::AlphaTest]) { return; } flags[Dirty::AlphaTest] = false; - const auto& regs = gpu.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - + const auto& regs = maxwell3d.regs; if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); @@ -1368,22 +1588,79 @@ void RasterizerOpenGL::SyncAlphaTest() { } void RasterizerOpenGL::SyncFramebufferSRGB() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FramebufferSRGB]) { return; } flags[Dirty::FramebufferSRGB] = false; - oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); + oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb); +} + +void RasterizerOpenGL::SyncTransformFeedback() { + // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal + // when this is required. 
+ const auto& regs = maxwell3d.regs; + + static constexpr std::size_t STRIDE = 3; + std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; + std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams; + + GLint* cursor = attribs.data(); + GLint* current_stream = streams.data(); + + for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { + const auto& layout = regs.tfb_layouts[feedback]; + UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); + if (layout.varying_count == 0) { + continue; + } + + *current_stream = static_cast<GLint>(feedback); + if (current_stream != streams.data()) { + // When stepping one stream, push the expected token + cursor[0] = GL_NEXT_BUFFER_NV; + cursor[1] = 0; + cursor[2] = 0; + cursor += STRIDE; + } + ++current_stream; + + const auto& locations = regs.tfb_varying_locs[feedback]; + std::optional<u8> current_index; + for (u32 offset = 0; offset < layout.varying_count; ++offset) { + const u8 location = locations[offset]; + const u8 index = location / 4; + + if (current_index == index) { + // Increase number of components of the previous attachment + ++cursor[-2]; + continue; + } + current_index = index; + + std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); + cursor[1] = 1; + cursor += STRIDE; + } + } + + const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE); + const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data()); + glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), + GL_INTERLEAVED_ATTRIBS); } void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (device.UseAssemblyShaders()) { + SyncTransformFeedback(); + } + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1410,11 +1687,15 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { static_cast<GLsizeiptr>(size)); } + // We may have to call BeginTransformFeedbackNV here since they seem to call different + // implementations on Nvidia's driver (the pointer is different) but we are using + // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB + // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works. 
glBeginTransformFeedback(GL_POINTS); } void RasterizerOpenGL::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1431,8 +1712,9 @@ void RasterizerOpenGL::EndTransformFeedback() { const GLuint handle = transform_feedback_buffers[index].handle; const GPUVAddr gpu_addr = binding.Address(); const std::size_t size = binding.buffer_size; - const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, + static_cast<GLsizeiptr>(size)); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 435da4425..1d0f585fa 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,10 +19,10 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -33,10 +33,11 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" -namespace Core { -class System; +namespace Core::Memory { +class Memory; } namespace Core::Frontend { @@ -52,10 +53,18 @@ namespace OpenGL { struct ScreenInfo; struct DrawParameters; +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); + class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Core::Memory::Memory& cpu_memory, const Device& device, + ScreenInfo& screen_info, ProgramManager& program_manager, StateTracker& state_tracker); ~RasterizerOpenGL() override; @@ -66,8 +75,15 @@ public: void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; + void OnCPUWrite(VAddr addr, u64 size) override; + void SyncGuestHost() override; + void SignalSemaphore(GPUVAddr addr, u32 value) override; + void SignalSyncPoint(u32 value) override; + void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -75,56 +91,65 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) 
override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { return num_queued_commands > 0; } + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); - void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); + /// Configures the color and depth framebuffer for clearing. + void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. - void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry); + void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); - /// Configures a constant buffer. + /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size); + size_t size, BindlessSSBO* ssbo); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. 
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); @@ -192,6 +217,10 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs transform feedback state to match guest state + /// @note Only valid on assembly shaders + void SyncTransformFeedback(); + /// Begin a transform feedback void BeginTransformFeedback(GLenum primitive_mode); @@ -215,31 +244,42 @@ private: void SetupShaders(GLenum primitive_mode); - const Device device; + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + const Device& device; + ScreenInfo& screen_info; + ProgramManager& program_manager; + StateTracker& state_tracker; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; QueryCache query_cache; + OGLBufferCache buffer_cache; + FenceManagerOpenGL fence_manager; - Core::System& system; - ScreenInfo& screen_info; - GLShader::ProgramManager& program_manager; - StateTracker& state_tracker; + VideoCommon::Shader::AsyncShaders async_shaders; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - OGLBufferCache buffer_cache; - VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker}; - BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; - BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + GLint vertex_binding = 0; std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> transform_feedback_buffers; std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> enabled_transform_feedback_buffers; + static constexpr std::size_t NUM_CONSTANT_BUFFERS = + Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; + std::size_t current_cbuf = 0; + OGLBuffer unified_uniform_buffer; + /// Number of commands queued to the OpenGL driver. Reseted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 97803d480..0ebcec427 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -82,11 +83,13 @@ void OGLSampler::Release() { handle = 0; } -void OGLShader::Create(const char* source, GLenum type) { - if (handle != 0) +void OGLShader::Create(std::string_view source, GLenum type) { + if (handle != 0) { return; - if (source == nullptr) + } + if (source.empty()) { return; + } MICROPROFILE_SCOPE(OpenGL_ResourceCreation); handle = GLShader::LoadShader(source, type); @@ -125,6 +128,15 @@ void OGLProgram::Release() { handle = 0; } +void OGLAssemblyProgram::Release() { + if (handle == 0) { + return; + } + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgramsARB(1, &handle); + handle = 0; +} + void OGLPipeline::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index de93f4212..f48398669 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,6 +4,7 @@ #pragma once +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -127,7 +128,7 @@ public: return *this; } - void Create(const char* source, GLenum type); + void Create(std::string_view source, GLenum type); void Release(); @@ -167,6 +168,28 @@ public: GLuint handle = 0; }; +class OGLAssemblyProgram : private NonCopyable { +public: + OGLAssemblyProgram() = default; + + OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLAssemblyProgram() { + Release(); + } + + OGLAssemblyProgram& operator=(OGLAssemblyProgram&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLPipeline : private NonCopyable { public: OGLPipeline() = default; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 12c6dcfde..bd56bed0c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -10,8 +10,6 @@ #include <thread> #include <unordered_set> -#include <boost/functional/hash.hpp> - #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -22,83 +20,35 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace OpenGL { using Tegra::Engines::ShaderType; -using VideoCommon::Shader::CompileDepth; -using VideoCommon::Shader::CompilerSettings; +using VideoCommon::Shader::GetShaderAddress; +using VideoCommon::Shader::GetShaderCode; +using VideoCommon::Shader::GetUniqueIdentifier; +using VideoCommon::Shader::KERNEL_MAIN_OFFSET; using 
VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::Registry; using VideoCommon::Shader::ShaderIR; +using VideoCommon::Shader::STAGE_MAIN_OFFSET; namespace { -constexpr u32 STAGE_MAIN_OFFSET = 10; -constexpr u32 KERNEL_MAIN_OFFSET = 0; - -constexpr CompilerSettings COMPILER_SETTINGS{CompileDepth::FullDecompile}; - -/// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; -} - -/// Gets if the current instruction offset is a scheduler instruction -constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { - // Sched instructions appear once every 4 instructions. - constexpr std::size_t SchedPeriod = 4; - const std::size_t absolute_offset = offset - main_offset; - return (absolute_offset % SchedPeriod) == 0; -} - -/// Calculates the size of a program stream -std::size_t CalculateProgramSize(const ProgramCode& program) { - constexpr std::size_t start_offset = 10; - // This is the encoded version of BRA that jumps to itself. All Nvidia - // shaders end with one. - constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; - constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; - std::size_t offset = start_offset; - while (offset < program.size()) { - const u64 instruction = program[offset]; - if (!IsSchedInstruction(offset, start_offset)) { - if ((instruction & mask) == self_jumping_branch) { - // End on Maxwell's "nop" instruction - break; - } - if (instruction == 0) { - break; - } - } - offset++; - } - // The last instruction is included in the program size - return std::min(offset + 1, program.size()); -} - -/// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, - const u8* host_ptr) { - ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); - ASSERT_OR_EXECUTE(host_ptr != nullptr, { - std::fill(code.begin(), code.end(), 0); - return code; - }); - memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64)); - code.resize(CalculateProgramSize(code)); - return code; -} +constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{}; /// Gets the shader type from a Maxwell program type constexpr GLenum GetGLShaderType(ShaderType shader_type) { @@ -116,17 +66,6 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) { } } -/// Hashes one (or two) program streams -u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code, - const ProgramCode& code_b = {}) { - u64 unique_identifier = boost::hash_value(code); - if (is_a) { - // VertexA programs include two programs - boost::hash_combine(unique_identifier, boost::hash_value(code_b)); - } - return unique_identifier; -} - constexpr const char* GetShaderTypeName(ShaderType shader_type) { switch (shader_type) { case ShaderType::Vertex: @@ -162,6 +101,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } +constexpr GLenum AssemblyEnum(ShaderType shader_type) { + switch (shader_type) { + case ShaderType::Vertex: + return GL_VERTEX_PROGRAM_NV; + case ShaderType::TesselationControl: + return GL_TESS_CONTROL_PROGRAM_NV; + case ShaderType::TesselationEval: + return GL_TESS_EVALUATION_PROGRAM_NV; + case ShaderType::Geometry: + return 
GL_GEOMETRY_PROGRAM_NV; + case ShaderType::Fragment: + return GL_FRAGMENT_PROGRAM_NV; + case ShaderType::Compute: + return GL_COMPUTE_PROGRAM_NV; + } + return {}; +} + std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } @@ -170,7 +127,7 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size}; const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer, entry.graphics_info, entry.compute_info}; - const auto registry = std::make_shared<Registry>(entry.type, info); + auto registry = std::make_shared<Registry>(entry.type, info); for (const auto& [address, value] : entry.keys) { const auto [buffer, offset] = address; registry->InsertKey(buffer, offset, value); @@ -185,21 +142,6 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { return registry; } -std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, - u64 unique_identifier, const ShaderIR& ir, - const Registry& registry, bool hint_retrievable = false) { - const std::string shader_id = MakeShaderID(unique_identifier, shader_type); - LOG_INFO(Render_OpenGL, "{}", shader_id); - - const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); - OGLShader shader; - shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); - - auto program = std::make_shared<OGLProgram>(); - program->Create(true, hint_retrievable, shader.handle); - return program; -} - std::unordered_set<GLenum> GetSupportedFormats() { GLint num_formats; glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); @@ -216,55 +158,138 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program)} {} +ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, + const ShaderIR& ir, const Registry& registry, bool hint_retrievable) { + const std::string shader_id = MakeShaderID(unique_identifier, shader_type); + LOG_INFO(Render_OpenGL, "{}", shader_id); + + auto program = std::make_shared<ProgramHandle>(); + + if (device.UseAssemblyShaders()) { + const std::string arb = + DecompileAssemblyShader(device, ir, registry, shader_type, shader_id); + + GLuint& arb_prog = program->assembly_program.handle; + +// Commented out functions signal OpenGL errors but are compatible with apitrace. +// Use them only to capture and replay on apitrace. 
+#if 0 + glGenProgramsNV(1, &arb_prog); + glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()), + reinterpret_cast<const GLubyte*>(arb.data())); +#else + glGenProgramsARB(1, &arb_prog); + glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB, + static_cast<GLsizei>(arb.size()), arb.data()); +#endif + const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV)); + if (err && *err) { + LOG_CRITICAL(Render_OpenGL, "{}", err); + LOG_INFO(Render_OpenGL, "\n{}", arb); + } + } else { + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); + OGLShader shader; + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + + program->source_program.Create(true, hint_retrievable, shader.handle); + } -CachedShader::~CachedShader() = default; + return program; +} -GLuint CachedShader::GetHandle() const { +Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_, bool is_built) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)}, + is_built(is_built) { + handle = program->assembly_program.handle; + if (handle == 0) { + handle = program->source_program.handle; + } + if (is_built) { + ASSERT(handle != 0); + } +} + +Shader::~Shader() = default; + +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); - return program->handle; + return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +bool Shader::IsBuilt() const { + return is_built; +} + +void Shader::AsyncOpenGLBuilt(OGLProgram new_program) { + program->source_program = std::move(new_program); + handle = program->source_program.handle; + is_built = true; +} + +void Shader::AsyncGLASMBuilt(OGLAssemblyProgram new_program) { + program->assembly_program = std::move(new_program); + handle = program->assembly_program.handle; + is_built = true; +} + +std::unique_ptr<Shader> Shader::CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, + ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); - const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional<ShaderIR> ir_b; - // if (!code_b.empty()) { - // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); - // } - auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); + + auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D()); + if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) { + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + // TODO(Rodrigo): Handle VertexA shaders + // std::optional<ShaderIR> ir_b; + // if (!code_b.empty()) { + // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); + // } + auto program = + BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = 
std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, shader_type), + std::move(program), true)); + } else { + // Required for entries + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + auto entries = MakeEntries(params.device, ir, shader_type); - ShaderDiskCacheEntry entry; - entry.type = shader_type; - entry.code = std::move(code); - entry.code_b = std::move(code_b); - entry.unique_identifier = params.unique_identifier; - entry.bound_buffer = registry->GetBoundBuffer(); - entry.graphics_info = registry->GetGraphicsInfo(); - entry.keys = registry->GetKeys(); - entry.bound_samplers = registry->GetBoundSamplers(); - entry.bindless_samplers = registry->GetBindlessSamplers(); - params.disk_cache.SaveEntry(std::move(entry)); + async_shaders.QueueOpenGLShader(params.device, shader_type, params.unique_identifier, + std::move(code), std::move(code_b), STAGE_MAIN_OFFSET, + COMPILER_SETTINGS, *registry, cpu_addr); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + auto program = std::make_shared<ProgramHandle>(); + return std::unique_ptr<Shader>( + new Shader(std::move(registry), std::move(entries), std::move(program), false)); + } } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - const std::size_t size_in_bytes = code.size() * sizeof(u64); +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); - auto& engine = params.system.GPU().KeplerCompute(); - auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); @@ -280,31 +305,43 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr<Shader>(new Shader( + 
precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_}, + gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, device{device_} {} -void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; + +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { + disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } - const std::vector gl_cache = disk_cache.LoadPrecompiled(); + std::vector<ShaderDiskCachePrecompiled> gl_cache; + if (!device.UseAssemblyShaders()) { + // Only load precompiled cache when we are not using assembly shaders + gl_cache = disk_cache.LoadPrecompiled(); + } const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during loading to know if we have to @@ -343,7 +380,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, auto registry = MakeRegistry(entry); const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; if (precompiled_entry) { // If the shader is precompiled, attempt to load it with program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); @@ -359,7 +396,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, PrecompiledShader shader; shader.program = std::move(program); shader.registry = std::move(registry); - shader.entries = MakeEntries(ir); + shader.entries = MakeEntries(device, ir, entry.type); std::scoped_lock lock{mutex}; if (callback) { @@ -370,7 +407,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } }; - const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)}; + const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())}; const std::size_t bucket_size{transferable->size() / num_workers}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers); std::vector<std::thread> threads(num_workers); @@ -397,6 +434,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } + if (device.UseAssemblyShaders()) { + // Don't store precompiled binaries for assembly shaders. 
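The loader above now clamps std::thread::hardware_concurrency(), which may legitimately report 0, to at least one worker before splitting the transferable cache into buckets. A sketch of the split that bucket_size implies; "worker" stands in for the per-thread lambda, and giving the remainder to the last worker is an assumption of this sketch rather than something shown in the hunk:

    const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())};
    const std::size_t num_entries = transferable->size();
    const std::size_t bucket_size{num_entries / num_workers};
    std::vector<std::thread> threads(num_workers);
    for (std::size_t i = 0; i < num_workers; ++i) {
        const std::size_t start = i * bucket_size;
        // Integer division drops the remainder, so the final bucket absorbs it.
        const std::size_t end = (i + 1 == num_workers) ? num_entries : start + bucket_size;
        threads[i] = std::thread(worker, start, end); // each worker runs on its own shared GL context
    }
    for (auto& thread : threads) {
        thread.join();
    }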
+ return; + } + // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them @@ -404,7 +446,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const u64 id = (*transferable)[i].unique_identifier; const auto it = find_precompiled(id); if (it == gl_cache.end()) { - const GLuint program = runtime_cache.at(id).program->handle; + const GLuint program = runtime_cache.at(id).program->source_program.handle; disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } @@ -415,7 +457,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( +ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats) { if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { @@ -423,15 +465,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return {}; } - auto program = std::make_shared<OGLProgram>(); - program->handle = glCreateProgram(); - glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(program->handle, precompiled_entry.binary_format, - precompiled_entry.binary.data(), + auto program = std::make_shared<ProgramHandle>(); + GLuint& handle = program->source_program.handle; + handle = glCreateProgram(); + glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(), static_cast<GLsizei>(precompiled_entry.binary.size())); GLint link_status; - glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); + glGetProgramiv(handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; @@ -440,77 +482,122 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { - return last_shaders[static_cast<std::size_t>(program)]; +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders) { + if (!maxwell3d.dirty.flags[Dirty::Shaders]) { + auto* last_shader = last_shaders[static_cast<std::size_t>(program)]; + if (last_shader->IsBuilt()) { + return last_shader; + } } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr address{GetShaderAddress(system, program)}; + const GPUVAddr address{GetShaderAddress(maxwell3d, program)}; + + if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { + auto completed_work = async_shaders.GetCompletedWork(); + for (auto& work : completed_work) { + Shader* shader = TryGet(work.cpu_address); + gpu.ShaderNotify().MarkShaderComplete(); + if (shader == nullptr) { + continue; + } + using namespace VideoCommon::Shader; + if (work.backend == AsyncShaders::Backend::OpenGL) { + shader->AsyncOpenGLBuilt(std::move(work.program.opengl)); + } else if (work.backend == AsyncShaders::Backend::GLASM) { + shader->AsyncGLASMBuilt(std::move(work.program.glasm)); + } + + auto& registry = shader->GetRegistry(); + + ShaderDiskCacheEntry entry; + entry.type = work.shader_type; + entry.code = std::move(work.code); + entry.code_b 
= std::move(work.code_b); + entry.unique_identifier = work.uid; + entry.bound_buffer = registry.GetBoundBuffer(); + entry.graphics_info = registry.GetGraphicsInfo(); + entry.keys = registry.GetKeys(); + entry.bound_samplers = registry.GetBoundSamplers(); + entry.bindless_samplers = registry.GetBindlessSamplers(); + disk_cache.SaveEntry(std::move(entry)); + } + } // Look up shader in the cache based on address - const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : nullptr}; - if (shader) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)}; + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } - const auto host_ptr{memory_manager.GetPointer(address)}; + const u8* const host_ptr{gpu_memory.GetPointer(address)}; // No shader found - create a new one - ProgramCode code{GetShaderCode(memory_manager, address, host_ptr)}; + ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)}; ProgramCode code_b; if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; - code_b = GetShaderCode(memory_manager, address_b, memory_manager.GetPointer(address_b)); + const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)}; + const u8* host_ptr_b = gpu_memory.GetPointer(address_b); + code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false); } + const std::size_t code_size = code.size() * sizeof(u64); - const auto unique_identifier = GetUniqueIdentifier( + const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, maxwell3d, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b), + async_shaders, cpu_addr.value_or(0)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } - Register(shader); - return last_shaders[static_cast<std::size_t>(program)] = shader; + Shader* const result = shader.get(); + if (cpu_addr) { + Register(std::move(shader), *cpu_addr, code_size); + } else { + null_shader = std::move(shader); + } + + return last_shaders[static_cast<std::size_t>(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : nullptr; - if (kernel) { + if (Shader* const kernel = cpu_addr ? 
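The asynchronous path above has two halves: CreateStageFromMemory queues the compilation and returns a Shader whose IsBuilt() is false, and GetStageProgram later drains async_shaders.GetCompletedWork(), swapping the finished handle in through AsyncOpenGLBuilt or AsyncGLASMBuilt and persisting a disk-cache entry. A condensed sketch of that hand-off with illustrative names; the real queue lives in video_core/shader/async_shaders.* and is not reproduced here:

    struct CompletedWork {
        VAddr cpu_address; // cache key recorded when the work was queued
        OGLProgram opengl; // linked on a worker thread using its own shared context
    };

    std::mutex completed_mutex;
    std::vector<CompletedWork> completed_work; // filled by the worker threads

    // Rasterizer thread, before resolving the next stage program:
    void DrainCompletedShaders(ShaderCacheOpenGL& cache, Tegra::GPU& gpu) {
        std::scoped_lock lock{completed_mutex};
        for (CompletedWork& work : completed_work) {
            gpu.ShaderNotify().MarkShaderComplete();
            if (Shader* const shader = cache.TryGet(work.cpu_address)) {
                shader->AsyncOpenGLBuilt(std::move(work.opengl)); // IsBuilt() turns true here
            }
        }
        completed_work.clear();
    }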
TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } - const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; + const u8* host_ptr{gpu_memory.GetPointer(code_addr)}; + ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)}; + const std::size_t code_size{code.size() * sizeof(u64)}; + const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, kepler_compute, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } - Register(kernel); - return kernel; + Shader* const result = kernel.get(); + if (cpu_addr) { + Register(std::move(kernel), *cpu_addr, code_size); + } else { + null_kernel = std::move(kernel); + } + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index c836df5bd..1708af06a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,114 +18,143 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" -namespace Core { -class System; +namespace Tegra { +class MemoryManager; } namespace Core::Frontend { class EmuWindow; } +namespace VideoCommon::Shader { +class AsyncShaders; +} + namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; -struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; +struct ProgramHandle { + OGLProgram source_program; + OGLAssemblyProgram assembly_program; +}; +using ProgramSharedPtr = std::shared_ptr<ProgramHandle>; + struct PrecompiledShader { - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; }; struct ShaderParameters { - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::ConstBufferEngineInterface& engine; ShaderDiskCacheOpenGL& disk_cache; const Device& device; VAddr cpu_addr; - u8* host_ptr; + const u8* host_ptr; u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +ProgramSharedPtr BuildShader(const Device& device, Tegra::Engines::ShaderType shader_type, + u64 unique_identifier, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + bool hint_retrievable = false); + +class Shader final { public: - ~CachedShader(); + 
~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } + bool IsBuilt() const; /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + const VideoCommon::Shader::Registry& GetRegistry() const { + return *registry; + } + + /// Mark a OpenGL shader as built + void AsyncOpenGLBuilt(OGLProgram new_program); + + /// Mark a GLASM shader as built + void AsyncGLASMBuilt(OGLAssemblyProgram new_program); + + static std::unique_ptr<Shader> CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b, + VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr); + + static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program); + explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, + ProgramSharedPtr program, bool is_built = true); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; - std::size_t size_in_bytes = 0; - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; + GLuint handle = 0; + bool is_built{}; }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window, + Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. 
- void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: - std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( + ProgramSharedPtr GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats); - Core::System& system; Core::Frontend::EmuWindow& emu_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; const Device& device; + ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; + + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index b1804e9ea..95ca96c8e 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; +using Tegra::Shader::TextureType; using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; @@ -61,8 +62,8 @@ struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint @@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { + const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); + // We waste one UBO for emulation + const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; + return num_ubos > num_available_ubos; +} + struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -412,8 +420,9 @@ class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, + suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device, ir, stage)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -484,7 +493,7 @@ private: code.AddLine("switch (jmp_to) {{"); for (const auto& pair : ir.GetBasicBlocks()) { - const auto [address, bb] = pair; + const auto& [address, bb] = pair; code.AddLine("case 0x{:X}U: {{", address); ++code.scope; @@ -518,6 +527,9 @@ private: if (device.HasImageLoadFormatted()) { 
code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasTextureShadowLod()) { + code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); + } if (device.HasWarpIntrinsics()) { code.AddLine("#extension GL_NV_gpu_shader5 : require"); code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -590,8 +602,15 @@ private: return; } const auto& info = registry.GetComputeInfo(); - if (const u32 size = info.shared_memory_size_in_words; size > 0) { - code.AddLine("shared uint smem[{}];", size); + if (u32 size = info.shared_memory_size_in_words * 4; size > 0) { + const u32 limit = device.GetMaxComputeSharedMemorySize(); + if (size > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size, limit); + size = limit; + } + + code.AddLine("shared uint smem[{}];", size / 4); code.AddNewLine(); } code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;", @@ -618,7 +637,9 @@ private: break; } } - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + + if (stage != ShaderType::Geometry && + (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } @@ -647,6 +668,16 @@ private: --code.scope; code.AddLine("}};"); code.AddNewLine(); + + if (stage == ShaderType::Geometry) { + if (ir.UsesLayer()) { + code.AddLine("out int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("out int gl_ViewportIndex;"); + } + } + code.AddNewLine(); } void DeclareRegisters() { @@ -782,7 +813,7 @@ private: const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); const auto it = transform_feedback.find(location); if (it == transform_feedback.end()) { - return {}; + return std::nullopt; } return it->second.components; } @@ -834,11 +865,24 @@ private: } void DeclareConstantBuffers() { + if (use_unified_uniforms) { + const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + + static_cast<u32>(ir.GetGlobalMemory().size()); + code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", + binding); + code.AddLine(" uint cbufs[];"); + code.AddLine("}};"); + code.AddNewLine(); + return; + } + u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto& [index, cbuf] : ir.GetConstantBuffers()) { + for (const auto [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); - code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); code.AddLine("}};"); code.AddNewLine(); } @@ -869,37 +913,37 @@ private: for (const auto& sampler : ir.GetSamplers()) { const std::string name = GetSampler(sampler); const std::string description = fmt::format("layout (binding = {}) uniform", binding); - binding += sampler.IsIndexed() ? sampler.Size() : 1; + binding += sampler.is_indexed ? 
sampler.size : 1; std::string sampler_type = [&]() { - if (sampler.IsBuffer()) { + if (sampler.is_buffer) { return "samplerBuffer"; } - switch (sampler.GetType()) { - case Tegra::Shader::TextureType::Texture1D: + switch (sampler.type) { + case TextureType::Texture1D: return "sampler1D"; - case Tegra::Shader::TextureType::Texture2D: + case TextureType::Texture2D: return "sampler2D"; - case Tegra::Shader::TextureType::Texture3D: + case TextureType::Texture3D: return "sampler3D"; - case Tegra::Shader::TextureType::TextureCube: + case TextureType::TextureCube: return "samplerCube"; default: UNREACHABLE(); return "sampler2D"; } }(); - if (sampler.IsArray()) { + if (sampler.is_array) { sampler_type += "Array"; } - if (sampler.IsShadow()) { + if (sampler.is_shadow) { sampler_type += "Shadow"; } - if (!sampler.IsIndexed()) { + if (!sampler.is_indexed) { code.AddLine("{} {} {};", description, sampler_type, name); } else { - code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size()); + code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.size); } } if (!ir.GetSamplers().empty()) { @@ -945,14 +989,14 @@ private: u32 binding = device.GetBaseBindings(stage).image; for (const auto& image : ir.GetImages()) { std::string qualifier = "coherent volatile"; - if (image.IsRead() && !image.IsWritten()) { + if (image.is_read && !image.is_written) { qualifier += " readonly"; - } else if (image.IsWritten() && !image.IsRead()) { + } else if (image.is_written && !image.is_read) { qualifier += " writeonly"; } - const char* format = image.IsAtomic() ? "r32ui, " : ""; - const char* type_declaration = GetImageTypeDeclaration(image.GetType()); + const char* format = image.is_atomic ? "r32ui, " : ""; + const char* type_declaration = GetImageTypeDeclaration(image.type); code.AddLine("layout ({}binding = {}) {} uniform uimage{} {};", format, binding++, qualifier, type_declaration, GetImage(image)); } @@ -1037,42 +1081,51 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); + const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; + if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; + if (use_unified_uniforms) { + return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), + Type::Uint}; + } else { + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; + } } - if (std::holds_alternative<OperationNode>(*offset)) { - // Indirect access - const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); + // Indirect access + if (use_unified_uniforms) { + return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, + Visit(offset).AsUint()), + Type::Uint}; + } - if (!device.HasComponentIndexingBug()) { - return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset), - Type::Uint}; - } + const std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); - // AMD's proprietary GLSL compiler emits ill code for variable component access. 
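When UseUnifiedUniforms() reports that a stage declares more constant buffers than the device exposes uniform-buffer bindings (minus the one reserved for emulation), every cbuf access above is rewritten to index a single std430 "cbufs[]" array. The addressing in isolation; the 64 KiB Maxwell constant buffer size used for the per-cbuf window is an assumption of this sketch:

    constexpr u32 MAX_CONSTBUFFER_SCALARS = 0x10000 / sizeof(u32); // 16384 uints per cbuf window
    // Scalar index into cbufs[] for a given constant buffer slot and byte offset.
    constexpr u32 UnifiedCbufElement(u32 cbuf_index, u32 byte_offset) {
        return cbuf_index * MAX_CONSTBUFFER_SCALARS + byte_offset / 4;
    }
    static_assert(UnifiedCbufElement(3, 0x20) == 3 * 16384 + 8);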
- // To bypass this driver bug generate 4 ifs, one per each component. - const std::string pack = code.GenerateTemporary(); - code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), - final_offset); - - const std::string result = code.GenerateTemporary(); - code.AddLine("uint {};", result); - for (u32 swizzle = 0; swizzle < 4; ++swizzle) { - code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, - pack, GetSwizzle(swizzle)); - } - return {result, Type::Uint}; + if (!device.HasComponentIndexingBug()) { + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } - UNREACHABLE_MSG("Unmanaged offset node type"); + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("uint {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, + GetSwizzle(swizzle)); + } + return {result, Type::Uint}; } if (const auto gmem = std::get_if<GmemNode>(&*node)) { @@ -1144,6 +1197,7 @@ private: return {"gl_FragCoord"s + GetSwizzle(element), Type::Float}; default: UNREACHABLE(); + return {"0", Type::Int}; } case Attribute::Index::FrontColor: return {"gl_Color"s + GetSwizzle(element), Type::Float}; @@ -1241,21 +1295,21 @@ private: switch (element) { case 0: UNIMPLEMENTED(); - return {}; + return std::nullopt; case 1: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_Layer", Type::Int}}; case 2: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_ViewportIndex", Type::Int}}; case 3: return {{"gl_PointSize", Type::Float}}; } - return {}; + return std::nullopt; case Attribute::Index::FrontColor: return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::FrontSecondaryColor: @@ -1278,7 +1332,7 @@ private: Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); - return {}; + return std::nullopt; } } @@ -1335,16 +1389,27 @@ private: ASSERT(meta); const std::size_t count = operation.GetOperandsCount(); - const bool has_array = meta->sampler.IsArray(); - const bool has_shadow = meta->sampler.IsShadow(); + const bool has_array = meta->sampler.is_array; + const bool has_shadow = meta->sampler.is_shadow; + const bool workaround_lod_array_shadow_as_grad = + !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube); + + std::string expr = "texture"; + + if (workaround_lod_array_shadow_as_grad) { + expr += "Grad"; + } else { + expr += function_suffix; + } - std::string expr = "texture" + function_suffix; if (!meta->aoffi.empty()) { expr += "Offset"; } else if (!meta->ptp.empty()) { expr += "Offsets"; } - if (!meta->sampler.IsIndexed()) { + if (!meta->sampler.is_indexed) { expr += '(' + GetSampler(meta->sampler) + ", "; } else { expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], "; @@ -1372,6 +1437,18 @@ private: 
expr += ')'; } + if (workaround_lod_array_shadow_as_grad) { + switch (meta->sampler.type) { + case TextureType::Texture2D: + return expr + ", vec2(0.0), vec2(0.0))"; + case TextureType::TextureCube: + return expr + ", vec3(0.0), vec3(0.0))"; + default: + UNREACHABLE(); + break; + } + } + for (const auto& variant : extras) { if (const auto argument = std::get_if<TextureArgument>(&variant)) { expr += GenerateTextureArgument(*argument); @@ -1482,8 +1559,8 @@ private: dy += '('; for (std::size_t index = 0; index < components; ++index) { - const auto operand_x{derivates.at(index * 2)}; - const auto operand_y{derivates.at(index * 2 + 1)}; + const auto& operand_x{derivates.at(index * 2)}; + const auto& operand_y{derivates.at(index * 2 + 1)}; dx += Visit(operand_x).AsFloat(); dy += Visit(operand_y).AsFloat(); @@ -1536,7 +1613,9 @@ private: Expression target; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { - // Writing to Register::ZeroIndex is a no op + // Writing to Register::ZeroIndex is a no op but we still have to visit the source + // as it might have side effects. + code.AddLine("{};", Visit(src).GetCode()); return {}; } target = {GetRegister(gpr->GetIndex()), Type::Float}; @@ -1838,38 +1917,48 @@ private: Type::HalfFloat}; } - template <Type type> - Expression LogicalLessThan(Operation operation) { - return GenerateBinaryInfix(operation, "<", Type::Bool, type, type); - } - - template <Type type> - Expression LogicalEqual(Operation operation) { - return GenerateBinaryInfix(operation, "==", Type::Bool, type, type); - } + template <const std::string_view& op, Type type, bool unordered = false> + Expression Comparison(Operation operation) { + static_assert(!unordered || type == Type::Float); - template <Type type> - Expression LogicalLessEqual(Operation operation) { - return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type); - } + Expression expr = GenerateBinaryInfix(operation, op, Type::Bool, type, type); - template <Type type> - Expression LogicalGreaterThan(Operation operation) { - return GenerateBinaryInfix(operation, ">", Type::Bool, type, type); + if constexpr (op.compare("!=") == 0 && type == Type::Float && !unordered) { + // GLSL's operator!=(float, float) doesn't seem be ordered. This happens on both AMD's + // and Nvidia's proprietary stacks. Manually force an ordered comparison. + return {fmt::format("({} && !isnan({}) && !isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; + } + if constexpr (!unordered) { + return expr; + } + // Unordered comparisons are always true for NaN operands. 
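In host terms, the ordered/unordered split introduced above means: an ordered comparison must be false whenever either operand is NaN, an unordered one must be true, and plain IEEE != already behaves as "unordered not-equal". A small standalone illustration of the two shapes the decompiler emits:

    #include <cmath>

    bool OrderedNotEqual(float a, float b) {
        // LT or GT only: reject NaN operands, as the GLSL workaround above does.
        return a != b && !std::isnan(a) && !std::isnan(b);
    }

    bool UnorderedEqual(float a, float b) {
        // EQU: additionally true whenever either operand is NaN.
        return a == b || std::isnan(a) || std::isnan(b);
    }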
+ return {fmt::format("({} || isnan({}) || isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalNotEqual(Operation operation) { - return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type); + Expression FOrdered(Operation operation) { + return {fmt::format("(!isnan({}) && !isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalGreaterEqual(Operation operation) { - return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type); + Expression FUnordered(Operation operation) { + return {fmt::format("(isnan({}) || isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - Expression LogicalFIsNan(Operation operation) { - return GenerateUnary(operation, "isnan", Type::Bool, Type::Float); + Expression LogicalAddCarry(Operation operation) { + const std::string carry = code.GenerateTemporary(); + code.AddLine("uint {};", carry); + code.AddLine("uaddCarry({}, {}, {});", VisitOperand(operation, 0).AsUint(), + VisitOperand(operation, 1).AsUint(), carry); + return {fmt::format("({} != 0)", carry), Type::Bool}; } Expression LogicalAssign(Operation operation) { @@ -1967,24 +2056,39 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.IsShadow()) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture( - operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); - if (meta->sampler.IsShadow()) { + std::string expr{}; + + if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube)) { + LOG_ERROR(Render_OpenGL, + "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); + expr = GenerateTexture(operation, "Lod", {}); + } else { + expr = GenerateTexture(operation, "Lod", + {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + } + + if (meta->sampler.is_shadow) { expr = "vec4(" + expr + ')'; } return {expr + GetSwizzle(meta->element), Type::Float}; @@ -1993,11 +2097,11 @@ private: Expression TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - const auto type = meta.sampler.IsShadow() ? Type::Float : Type::Int; - const bool separate_dc = meta.sampler.IsShadow(); + const auto type = meta.sampler.is_shadow ? 
Type::Float : Type::Int; + const bool separate_dc = meta.sampler.is_shadow; std::vector<TextureIR> ir; - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { ir = {TextureOffset{}}; } else { ir = {TextureOffset{}, TextureArgument{type, meta.component}}; @@ -2042,7 +2146,7 @@ private: constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - UNIMPLEMENTED_IF(meta->sampler.IsArray()); + UNIMPLEMENTED_IF(meta->sampler.is_array); const std::size_t count = operation.GetOperandsCount(); std::string expr = "texelFetch("; @@ -2063,7 +2167,7 @@ private: } expr += ')'; - if (meta->lod && !meta->sampler.IsBuffer()) { + if (meta->lod && !meta->sampler.is_buffer) { expr += ", "; expr += Visit(meta->lod).AsInt(); } @@ -2074,12 +2178,10 @@ private: } Expression TextureGradient(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); std::string expr = GenerateTexture(operation, "Grad", {TextureDerivates{}, TextureOffset{}}); - return {std::move(expr) + GetSwizzle(meta->element), Type::Float}; + return {std::move(expr) + GetSwizzle(meta.element), Type::Float}; } Expression ImageLoad(Operation operation) { @@ -2295,6 +2397,18 @@ private: return {"gl_SubGroupInvocationARB", Type::Uint}; } + template <const std::string_view& comparison> + Expression ThreadMask(Operation) { + if (device.HasWarpIntrinsics()) { + return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint}; + } + if (device.HasShaderBallot()) { + return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint}; + } + LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader"); + return {"0U", Type::Uint}; + } + Expression ShuffleIndexed(Operation operation) { std::string value = VisitOperand(operation, 0).AsFloat(); @@ -2307,7 +2421,21 @@ private: return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float}; } - Expression MemoryBarrierGL(Operation) { + Expression Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled"); + return {}; + } + code.AddLine("barrier();"); + return {}; + } + + Expression MemoryBarrierGroup(Operation) { + code.AddLine("groupMemoryBarrier();"); + return {}; + } + + Expression MemoryBarrierGlobal(Operation) { code.AddLine("memoryBarrier();"); return {}; } @@ -2316,6 +2444,19 @@ private: Func() = delete; ~Func() = delete; + static constexpr std::string_view LessThan = "<"; + static constexpr std::string_view Equal = "=="; + static constexpr std::string_view LessEqual = "<="; + static constexpr std::string_view GreaterThan = ">"; + static constexpr std::string_view NotEqual = "!="; + static constexpr std::string_view GreaterEqual = ">="; + + static constexpr std::string_view Eq = "Eq"; + static constexpr std::string_view Ge = "Ge"; + static constexpr std::string_view Gt = "Gt"; + static constexpr std::string_view Le = "Le"; + static constexpr std::string_view Lt = "Lt"; + static constexpr std::string_view Add = "Add"; static constexpr std::string_view Min = "Min"; static constexpr std::string_view Max = "Max"; @@ -2417,27 +2558,36 @@ private: &GLSLDecompiler::LogicalPick2, &GLSLDecompiler::LogicalAnd2, - &GLSLDecompiler::LogicalLessThan<Type::Float>, - &GLSLDecompiler::LogicalEqual<Type::Float>, - &GLSLDecompiler::LogicalLessEqual<Type::Float>, - 
&GLSLDecompiler::LogicalGreaterThan<Type::Float>, - &GLSLDecompiler::LogicalNotEqual<Type::Float>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Float>, - &GLSLDecompiler::LogicalFIsNan, - - &GLSLDecompiler::LogicalLessThan<Type::Int>, - &GLSLDecompiler::LogicalEqual<Type::Int>, - &GLSLDecompiler::LogicalLessEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterThan<Type::Int>, - &GLSLDecompiler::LogicalNotEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Int>, - - &GLSLDecompiler::LogicalLessThan<Type::Uint>, - &GLSLDecompiler::LogicalEqual<Type::Uint>, - &GLSLDecompiler::LogicalLessEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterThan<Type::Uint>, - &GLSLDecompiler::LogicalNotEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, false>, + &GLSLDecompiler::FOrdered, + &GLSLDecompiler::FUnordered, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, true>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Int>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Int>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Uint>, + + &GLSLDecompiler::LogicalAddCarry, &GLSLDecompiler::Logical2HLessThan<false>, &GLSLDecompiler::Logical2HEqual<false>, @@ -2524,9 +2674,16 @@ private: &GLSLDecompiler::VoteEqual, &GLSLDecompiler::ThreadId, + &GLSLDecompiler::ThreadMask<Func::Eq>, + &GLSLDecompiler::ThreadMask<Func::Ge>, + &GLSLDecompiler::ThreadMask<Func::Gt>, + &GLSLDecompiler::ThreadMask<Func::Le>, + &GLSLDecompiler::ThreadMask<Func::Lt>, &GLSLDecompiler::ShuffleIndexed, - &GLSLDecompiler::MemoryBarrierGL, + &GLSLDecompiler::Barrier, + &GLSLDecompiler::MemoryBarrierGroup, + &GLSLDecompiler::MemoryBarrierGlobal, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2596,11 +2753,11 @@ private: } std::string GetSampler(const Sampler& sampler) const { - return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(sampler.index, "sampler"); } std::string GetImage(const Image& image) const { - return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(image.index, "image"); } std::string AppendSuffix(u32 index, std::string_view name) const { @@ -2623,15 +2780,6 @@ private: return 
std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } - bool IsRenderTargetEnabled(u32 render_target) const { - for (u32 component = 0; component < 4; ++component) { - if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { - return true; - } - } - return false; - } - const Device& device; const ShaderIR& ir; const Registry& registry; @@ -2639,6 +2787,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -2834,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2855,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } entries.shader_length = ir.GetLength(); + entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e7dbd810c..451c9689a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -33,36 +33,19 @@ public: } private: - u32 index{}; + u32 index = 0; }; -class GlobalMemoryEntry { -public: - explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) +struct GlobalMemoryEntry { + constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, + bool is_written) : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ is_written} {} - u32 GetCbufIndex() const { - return cbuf_index; - } - - u32 GetCbufOffset() const { - return cbuf_offset; - } - - bool IsRead() const { - return is_read; - } - - bool IsWritten() const { - return is_written; - } - -private: - u32 cbuf_index{}; - u32 cbuf_offset{}; - bool is_read{}; - bool is_written{}; + u32 cbuf_index = 0; + u32 cbuf_offset = 0; + bool is_read = false; + bool is_written = false; }; struct ShaderEntries { @@ -70,11 +53,13 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - u32 clip_distances{}; std::size_t shader_length{}; + u32 clip_distances{}; + bool use_unified_uniforms{}; }; -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); +ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Tegra::Engines::ShaderType stage); std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, const VideoCommon::Shader::Registry& registry, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..70dd0c3c6 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array<u8, 64>; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 
value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -63,7 +73,7 @@ ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default; ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default; -bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { +bool ShaderDiskCacheEntry::Load(Common::FS::IOFile& file) { if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { return false; } @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,29 +113,38 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector<ConstBufferKey> flat_keys(num_keys); - std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); + std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); + std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); + } + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; } -bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { +bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const 
{ if (file.WriteObject(static_cast<u32>(type)) != 1 || file.WriteObject(static_cast<u32>(code.size())) != 1 || file.WriteObject(static_cast<u32>(code_b.size())) != 1) { @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast<u32>(keys.size())) != 1 || file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { return false; } @@ -152,48 +174,64 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector<BoundSamplerKey> flat_bound_samplers; + std::vector<BoundSamplerEntry> flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); } - std::vector<BindlessSamplerKey> flat_bindless_samplers; + std::vector<SeparateSamplerEntry> flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); + } + + std::vector<BindlessSamplerEntry> flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default; ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; +void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) { + title_id = title_id_; +} + std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id - const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; - if (!Settings::values.use_disk_shader_cache || !has_title_id) { - return {}; + const bool has_title_id = title_id != 0; + if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) { + return std::nullopt; } - FileUtil::IOFile file(GetTransferablePath(), "rb"); + Common::FS::IOFile file(GetTransferablePath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No transferable shader cache found"); is_usable = true; - return {}; + return std::nullopt; } u32 version{}; if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it"); - return {}; + return std::nullopt; } if (version < NativeVersion) { @@ -201,12 +239,12 
@@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran file.Close(); InvalidateTransferable(); is_usable = true; - return {}; + return std::nullopt; } if (version > NativeVersion) { LOG_WARNING(Render_OpenGL, "Transferable shader cache was generated with a newer version " "of the emulator, skipping"); - return {}; + return std::nullopt; } // Version is valid, load the shaders @@ -215,7 +253,7 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran ShaderDiskCacheEntry& entry = entries.emplace_back(); if (!entry.Load(file)) { LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping"); - return {}; + return std::nullopt; } } @@ -228,7 +266,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() return {}; } - FileUtil::IOFile file(GetPrecompiledPath(), "rb"); + Common::FS::IOFile file(GetPrecompiledPath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No precompiled shader cache found"); return {}; @@ -245,7 +283,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() } std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile( - FileUtil::IOFile& file) { + Common::FS::IOFile& file) { // Read compressed file from disk and decompress to virtual precompiled cache file std::vector<u8> compressed(file.GetSize()); file.ReadBytes(compressed.data(), compressed.size()); @@ -256,12 +294,12 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo ShaderCacheVersionHash file_hash{}; if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } if (GetShaderCacheVersionHash() != file_hash) { LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } std::vector<ShaderDiskCachePrecompiled> entries; @@ -271,19 +309,19 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo if (!LoadObjectFromPrecompiled(entry.unique_identifier) || !LoadObjectFromPrecompiled(entry.binary_format) || !LoadObjectFromPrecompiled(binary_size)) { - return {}; + return std::nullopt; } entry.binary.resize(binary_size); if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) { - return {}; + return std::nullopt; } } return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { - if (!FileUtil::Delete(GetTransferablePath())) { + if (!Common::FS::Delete(GetTransferablePath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}", GetTransferablePath()); } @@ -294,7 +332,7 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { // Clear virtaul precompiled cache file precompiled_cache_virtual_file.Resize(0); - if (!FileUtil::Delete(GetPrecompiledPath())) { + if (!Common::FS::Delete(GetPrecompiledPath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath()); } } @@ -310,7 +348,7 @@ void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { return; } - FileUtil::IOFile file = AppendTransferableFile(); + Common::FS::IOFile file = AppendTransferableFile(); if (!file.IsOpen()) { return; } @@ -352,15 +390,15 @@ void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint progra } } -FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { +Common::FS::IOFile 
ShaderDiskCacheOpenGL::AppendTransferableFile() const { if (!EnsureDirectories()) { return {}; } const auto transferable_path{GetTransferablePath()}; - const bool existed = FileUtil::Exists(transferable_path); + const bool existed = Common::FS::Exists(transferable_path); - FileUtil::IOFile file(transferable_path, "ab"); + Common::FS::IOFile file(transferable_path, "ab"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open transferable cache in path={}", transferable_path); return {}; @@ -392,7 +430,7 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size()); const auto precompiled_path{GetPrecompiledPath()}; - FileUtil::IOFile file(precompiled_path, "wb"); + Common::FS::IOFile file(precompiled_path, "wb"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); @@ -406,24 +444,24 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { bool ShaderDiskCacheOpenGL::EnsureDirectories() const { const auto CreateDir = [](const std::string& dir) { - if (!FileUtil::CreateDir(dir)) { + if (!Common::FS::CreateDir(dir)) { LOG_ERROR(Render_OpenGL, "Failed to create directory={}", dir); return false; } return true; }; - return CreateDir(FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir)) && + return CreateDir(Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir)) && CreateDir(GetBaseDir()) && CreateDir(GetTransferableDir()) && CreateDir(GetPrecompiledDir()); } std::string ShaderDiskCacheOpenGL::GetTransferablePath() const { - return FileUtil::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetPrecompiledPath() const { - return FileUtil::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetTransferableDir() const { @@ -435,11 +473,11 @@ std::string ShaderDiskCacheOpenGL::GetPrecompiledDir() const { } std::string ShaderDiskCacheOpenGL::GetBaseDir() const { - return FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir) + DIR_SEP "opengl"; + return Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir) + DIR_SEP "opengl"; } std::string ShaderDiskCacheOpenGL::GetTitleID() const { - return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID()); + return fmt::format("{:016X}", title_id); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..aef841c1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -21,11 +21,7 @@ #include "video_core/engines/shader_type.h" #include "video_core/shader/registry.h" -namespace Core { -class System; -} - -namespace FileUtil { +namespace Common::FS { class IOFile; } @@ -38,9 +34,9 @@ struct ShaderDiskCacheEntry { ShaderDiskCacheEntry(); ~ShaderDiskCacheEntry(); - bool Load(FileUtil::IOFile& file); + bool Load(Common::FS::IOFile& file); - bool Save(FileUtil::IOFile& file) const; + bool Save(Common::FS::IOFile& file) const; bool HasProgramA() const { return !code.empty() && !code_b.empty(); @@ -57,6 +53,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap 
keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; @@ -69,9 +66,12 @@ struct ShaderDiskCachePrecompiled { class ShaderDiskCacheOpenGL { public: - explicit ShaderDiskCacheOpenGL(Core::System& system); + explicit ShaderDiskCacheOpenGL(); ~ShaderDiskCacheOpenGL(); + /// Binds a title ID for all future operations. + void BindTitleID(u64 title_id); + /// Loads transferable cache. If the file has an old version or on failure, it deletes the file. std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); @@ -96,10 +96,10 @@ public: private: /// Loads the precompiled cache. Returns empty on failure. std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile( - FileUtil::IOFile& file); + Common::FS::IOFile& file); /// Opens the current game's transferable file and writes its header if it doesn't exist - FileUtil::IOFile AppendTransferableFile() const; + Common::FS::IOFile AppendTransferableFile() const; /// Save precompiled header to precompiled_cache_in_memory void SavePrecompiledHeaderToVirtualPrecompiledCache(); @@ -156,8 +156,6 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - Core::System& system; - // Stores whole precompiled cache which will be read from or saved to the precompiled cache // file FileSys::VectorVfsFile precompiled_cache_virtual_file; @@ -167,8 +165,11 @@ private: // Stored transferable shaders std::unordered_set<u64> stored_transferable; + /// Title ID to operate on + u64 title_id = 0; + // The cache has been loaded at boot - bool is_usable{}; + bool is_usable = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 9c7b0adbd..691c6c79b 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -6,47 +6,124 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -namespace OpenGL::GLShader { +namespace OpenGL { -ProgramManager::ProgramManager() = default; +namespace { + +void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) { + if (current == old) { + return; + } + if (current == 0) { + if (enabled) { + enabled = false; + glDisable(stage); + } + return; + } + if (!enabled) { + enabled = true; + glEnable(stage); + } + glBindProgramARB(stage, current); +} + +} // Anonymous namespace + +ProgramManager::ProgramManager(const Device& device) + : use_assembly_programs{device.UseAssemblyShaders()} { + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + } else { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); + } +} ProgramManager::~ProgramManager() = default; -void ProgramManager::Create() { - graphics_pipeline.Create(); - glBindProgramPipeline(graphics_pipeline.handle); +void ProgramManager::BindCompute(GLuint program) { + if (use_assembly_programs) { + glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program); + } else { + is_graphics_bound = false; + glUseProgram(program); + } } void ProgramManager::BindGraphicsPipeline() { + if (!use_assembly_programs) { + UpdateSourcePrograms(); + } +} + +void ProgramManager::BindHostPipeline(GLuint pipeline) { + if (use_assembly_programs) { + if (geometry_enabled) { + geometry_enabled = false; + old_state.geometry = 0; +
glDisable(GL_GEOMETRY_PROGRAM_NV); + } + } else { + if (!is_graphics_bound) { + glUseProgram(0); + } + } + glBindProgramPipeline(pipeline); +} + +void ProgramManager::RestoreGuestPipeline() { + if (use_assembly_programs) { + glBindProgramPipeline(0); + } else { + glBindProgramPipeline(graphics_pipeline.handle); + } +} + +void ProgramManager::UseVertexShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); + } + current_state.vertex = program; +} + +void ProgramManager::UseGeometryShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.geometry, geometry_enabled); + } + current_state.geometry = program; +} + +void ProgramManager::UseFragmentShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.fragment, fragment_enabled); + } + current_state.fragment = program; +} + +void ProgramManager::UpdateSourcePrograms() { if (!is_graphics_bound) { is_graphics_bound = true; glUseProgram(0); } - // Avoid updating the pipeline when values have no changed - if (old_state == current_state) { - return; - } - - // Workaround for AMD bug - static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; const GLuint handle = graphics_pipeline.handle; - glUseProgramStages(handle, all_used_stages, 0); - glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) { + if (current == old) { + return; + } + glUseProgramStages(handle, stage, current); + }; + update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex); + update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry); + update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment); old_state = current_state; } -void ProgramManager::BindComputeShader(GLuint program) { - is_graphics_bound = false; - glUseProgram(program); -} - void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; @@ -54,4 +131,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f; } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index d2e47f2a9..950e0dfcb 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -11,7 +11,9 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" -namespace OpenGL::GLShader { +namespace OpenGL { + +class Device; /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned /// @note Always keep a vec4 at the end. 
The GL spec is not clear whether the alignment at @@ -28,50 +30,47 @@ static_assert(sizeof(MaxwellUniformData) < 16384, class ProgramManager { public: - explicit ProgramManager(); + explicit ProgramManager(const Device& device); ~ProgramManager(); - void Create(); + /// Binds a compute program + void BindCompute(GLuint program); - /// Updates the graphics pipeline and binds it. + /// Updates bound programs. void BindGraphicsPipeline(); - /// Binds a compute shader. - void BindComputeShader(GLuint program); - - void UseVertexShader(GLuint program) { - current_state.vertex_shader = program; - } + /// Binds an OpenGL pipeline object unsynchronized with the guest state. + void BindHostPipeline(GLuint pipeline); - void UseGeometryShader(GLuint program) { - current_state.geometry_shader = program; - } + /// Rewinds BindHostPipeline state changes. + void RestoreGuestPipeline(); - void UseFragmentShader(GLuint program) { - current_state.fragment_shader = program; - } + void UseVertexShader(GLuint program); + void UseGeometryShader(GLuint program); + void UseFragmentShader(GLuint program); private: struct PipelineState { - bool operator==(const PipelineState& rhs) const noexcept { - return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && - geometry_shader == rhs.geometry_shader; - } - - bool operator!=(const PipelineState& rhs) const noexcept { - return !operator==(rhs); - } - - GLuint vertex_shader = 0; - GLuint fragment_shader = 0; - GLuint geometry_shader = 0; + GLuint vertex = 0; + GLuint geometry = 0; + GLuint fragment = 0; }; + /// Update GLSL programs. + void UpdateSourcePrograms(); + OGLPipeline graphics_pipeline; - OGLPipeline compute_pipeline; + PipelineState current_state; PipelineState old_state; + + bool use_assembly_programs = false; + bool is_graphics_bound = true; + + bool vertex_enabled = false; + bool geometry_enabled = false; + bool fragment_enabled = false; }; -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 9e74eda0d..4bf0d6090 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
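For context, a minimal sketch of how a caller is expected to drive the reworked ProgramManager declared above. The Device reference and the shader handles are assumed to come from the renderer; the helper name is illustrative only:

    // Hypothetical helper: binds guest graphics shaders through the new interface.
    void BindGuestShaders(const Device& device, GLuint vertex, GLuint fragment) {
        ProgramManager program_manager{device};    // picks GLSL separable or NV assembly path
        program_manager.UseVertexShader(vertex);
        program_manager.UseGeometryShader(0);      // 0 leaves the geometry stage unbound
        program_manager.UseFragmentShader(fragment);
        program_manager.BindGraphicsPipeline();    // flushes pending stage changes on the GLSL path
    }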
+#include <string_view> #include <vector> #include <glad/glad.h> #include "common/assert.h" @@ -11,7 +12,8 @@ namespace OpenGL::GLShader { namespace { -const char* GetStageDebugName(GLenum type) { + +std::string_view StageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: return "vertex"; @@ -25,12 +27,17 @@ const char* GetStageDebugName(GLenum type) { UNIMPLEMENTED(); return "unknown"; } + } // Anonymous namespace -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type = GetStageDebugName(type); +GLuint LoadShader(std::string_view source, GLenum type) { + const std::string_view debug_type = StageDebugName(type); const GLuint shader_id = glCreateShader(type); - glShaderSource(shader_id, 1, &source, nullptr); + + const GLchar* source_string = source.data(); + const GLint source_length = static_cast<GLint>(source.size()); + + glShaderSource(shader_id, 1, &source_string, &source_length); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); glCompileShader(shader_id); diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h index 03b7548c2..1b770532e 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.h +++ b/src/video_core/renderer_opengl/gl_shader_util.h @@ -38,7 +38,7 @@ void LogShaderSource(T... shaders) { * @param source String of the GLSL shader program * @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER) */ -GLuint LoadShader(const char* source, GLenum type); +GLuint LoadShader(std::string_view source, GLenum type); /** * Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader) diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index d24fad3de..6bcf831f2 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -214,10 +214,8 @@ void SetupDirtyMisc(Tables& tables) { } // Anonymous namespace -StateTracker::StateTracker(Core::System& system) : system{system} {} - -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; +StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { + auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyColorMasks(tables); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 0f823288e..9d127548f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -13,8 +13,8 @@ #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" -namespace Core { -class System; +namespace Tegra { +class GPU; } namespace OpenGL { @@ -90,9 +90,7 @@ static_assert(Last <= std::numeric_limits<u8>::max()); class StateTracker { public: - explicit StateTracker(Core::System& system); - - void Initialize(); + explicit StateTracker(Tegra::GPU& gpu); void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { @@ -103,7 +101,6 @@ public: } void NotifyScreenDrawVertexArray() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; @@ -117,98 +114,81 @@ public: } void NotifyPolygonModes() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; 
flags[OpenGL::Dirty::PolygonModes] = true; flags[OpenGL::Dirty::PolygonModeFront] = true; flags[OpenGL::Dirty::PolygonModeBack] = true; } void NotifyViewport0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Viewports] = true; flags[OpenGL::Dirty::Viewport0] = true; } void NotifyScissor0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Scissors] = true; flags[OpenGL::Dirty::Scissor0] = true; } void NotifyColorMask0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ColorMasks] = true; flags[OpenGL::Dirty::ColorMask0] = true; } void NotifyBlend0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::BlendStates] = true; flags[OpenGL::Dirty::BlendState0] = true; } void NotifyFramebuffer() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[VideoCommon::Dirty::RenderTargets] = true; } void NotifyFrontFace() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FrontFace] = true; } void NotifyCullTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::CullTest] = true; } void NotifyDepthMask() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthMask] = true; } void NotifyDepthTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthTest] = true; } void NotifyStencilTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::StencilTest] = true; } void NotifyPolygonOffset() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonOffset] = true; } void NotifyRasterizeEnable() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::RasterizeEnable] = true; } void NotifyFramebufferSRGB() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FramebufferSRGB] = true; } void NotifyLogicOp() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::LogicOp] = true; } void NotifyClipControl() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ClipControl] = true; } void NotifyAlphaTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::AlphaTest] = true; } private: - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; GLuint index_buffer = 0; }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c53..887995cf4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,11 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
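To make the intent of the StateTracker change above concrete, a short sketch of the new wiring, assuming the frontend supplies the Tegra::GPU reference; the helper name is illustrative only:

    // Hypothetical helper: the tracker caches Maxwell3D's dirty flags at construction,
    // so notifications no longer go through Core::System on every call.
    void MarkScreenDrawDirty(Tegra::GPU& gpu) {
        StateTracker state_tracker{gpu};              // stores gpu.Maxwell3D().dirty.flags
        state_tracker.NotifyScreenDrawVertexArray();  // sets vertex format/buffer dirty bits
        state_tracker.NotifyFramebuffer();            // marks render targets dirty
    }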
-#include <deque> +#include <tuple> #include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, - bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { gl_buffer.Create(); @@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (use_persistent) { - persistent = true; - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast<u8*>(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); - } else { - glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); + static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; + glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + mapped_ptr = static_cast<u8*>( + glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } } OGLStreamBuffer::~OGLStreamBuffer() { - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); gl_buffer.Release(); } -GLuint OGLStreamBuffer::GetHandle() const { - return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { - return buffer_size; -} - std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { ASSERT(size <= buffer_size); ASSERT(alignment <= buffer_size); @@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a bool invalidate = false; if (buffer_pos + size > buffer_size) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); + glInvalidateBufferData(gl_buffer.handle); + buffer_pos = 0; invalidate = true; - - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } } - if (invalidate || !persistent) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | - (invalidate ? 
GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); - mapped_offset = buffer_pos; - } - - return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); + return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); - if (!coherent && size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); - } - - if (!persistent) { - glUnmapNamedBuffer(gl_buffer.handle); + if (size > 0) { + glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); } buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd4..307a67113 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,15 +11,13 @@ namespace OpenGL { +class Device; + class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, - bool use_persistent = true); + explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); - GLuint GetHandle() const; - GLsizeiptr GetSize() const; - /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. @@ -32,15 +30,24 @@ public: void Unmap(GLsizeiptr size); + GLuint Handle() const { + return gl_buffer.handle; + } + + u64 Address() const { + return gpu_address; + } + + GLsizeiptr Size() const noexcept { + return buffer_size; + } + private: OGLBuffer gl_buffer; - bool coherent = false; - bool persistent = false; - + GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; - GLintptr mapped_offset = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2729d1265..a863ef218 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -35,96 +35,109 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", namespace { struct FormatTuple { - GLint internal_format; + GLenum internal_format; GLenum format = GL_NONE; GLenum type = GL_NONE; }; constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // ABGR8U - {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // ABGR8S - {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // ABGR8UI - {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5U - {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10U - {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5U - {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8U - {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8UI - {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // RGBA16F - {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // RGBA16U - {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // RGBA16S - {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // RGBA16UI - {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // R11FG11FB10F - {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // RGBA32UI - {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // DXT1 - 
{GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // DXT23 - {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // DXT45 - {GL_COMPRESSED_RED_RGTC1}, // DXN1 - {GL_COMPRESSED_RG_RGTC2}, // DXN2UNORM - {GL_COMPRESSED_SIGNED_RG_RGTC2}, // DXN2SNORM - {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7U - {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UF16 - {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SF16 - {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4 - {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 - {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // RGBA32F - {GL_RG32F, GL_RG, GL_FLOAT}, // RG32F - {GL_R32F, GL_RED, GL_FLOAT}, // R32F - {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16F - {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16U - {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16S - {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16UI - {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16I - {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // RG16 - {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // RG16F - {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // RG16UI - {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // RG16I - {GL_RG16_SNORM, GL_RG, GL_SHORT}, // RG16S - {GL_RGB32F, GL_RGB, GL_FLOAT}, // RGB32F - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // RGBA8_SRGB - {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8U - {GL_RG8_SNORM, GL_RG, GL_BYTE}, // RG8S - {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // RG32UI - {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // RGBX16F - {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32UI - {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32I - {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8 - {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5 - {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4 - {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM + {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT + {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // A8B8G8R8_UINT + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // R5G6B5_UNORM + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5_UNORM + {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1R5G5B5_UNORM + {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UNORM + {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT + {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5_UNORM + {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8_UNORM + {GL_R8_SNORM, GL_RED, GL_BYTE}, // R8_SNORM + {GL_R8I, GL_RED_INTEGER, GL_BYTE}, // R8_SINT + {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8_UINT + {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16A16_FLOAT + {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // R16G16B16A16_UNORM + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // R16G16B16A16_SNORM + {GL_RGBA16I, GL_RGBA_INTEGER, GL_SHORT}, // R16G16B16A16_SINT + {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // R16G16B16A16_UINT + {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // B10G11R11_FLOAT + {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // R32G32B32A32_UINT + {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // BC1_RGBA_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // BC2_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // BC3_UNORM + {GL_COMPRESSED_RED_RGTC1}, // BC4_UNORM + {GL_COMPRESSED_SIGNED_RED_RGTC1}, // BC4_SNORM + {GL_COMPRESSED_RG_RGTC2}, // BC5_UNORM + {GL_COMPRESSED_SIGNED_RG_RGTC2}, // BC5_SNORM + {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7_UNORM + {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT + 
{GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT + {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM + {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM + {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT + {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT + {GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT + {GL_RG32I, GL_RG_INTEGER, GL_INT}, // R32G32_SINT + {GL_R32F, GL_RED, GL_FLOAT}, // R32_FLOAT + {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16_FLOAT + {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16_UNORM + {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16_SNORM + {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16_UINT + {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16_SINT + {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // R16G16_UNORM + {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // R16G16_FLOAT + {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // R16G16_UINT + {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // R16G16_SINT + {GL_RG16_SNORM, GL_RG, GL_SHORT}, // R16G16_SNORM + {GL_RGB32F, GL_RGB, GL_FLOAT}, // R32G32B32_FLOAT + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_SRGB + {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // R8G8_UNORM + {GL_RG8_SNORM, GL_RG, GL_BYTE}, // R8G8_SNORM + {GL_RG8I, GL_RG_INTEGER, GL_BYTE}, // R8G8_SINT + {GL_RG8UI, GL_RG_INTEGER, GL_UNSIGNED_BYTE}, // R8G8_UINT + {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // R32G32_UINT + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16X16_FLOAT + {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32_UINT + {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32_SINT + {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM + {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM + {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM + {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // DXT1_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // DXT23_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // DXT45_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7U_SRGB - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // R4G4B4A4U + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB - {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5 + {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8 + {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6 + {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10 + {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB - {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12 + {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB - 
{GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6 + {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5 + {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB - {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9F + {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // Z32F - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // Z16 + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // Z24S8 - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8Z24 - {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // Z32FS8 + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM + {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, + GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT }}; const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { @@ -177,10 +190,10 @@ GLint GetSwizzleSource(SwizzleSource source) { GLenum GetComponent(PixelFormat format, bool is_first) { switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; - case PixelFormat::S8Z24: + case PixelFormat::S8_UINT_D24_UNORM: return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: UNREACHABLE(); @@ -237,6 +250,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte return texture; } +constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, + SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params, @@ -256,9 +275,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? 
params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -379,8 +403,8 @@ void CachedSurface::DecorateSurfaceName() { LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); } -void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) { - LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix); +void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) { + LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix); } View CachedSurface::CreateView(const ViewParams& view_key) { @@ -396,32 +420,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr } CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, - const bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { - target = GetTextureTarget(params.target); - format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format; + bool is_proxy) + : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, + target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { if (!is_proxy) { - texture_view = CreateTextureView(); + main_view = CreateTextureView(); } - swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A); } CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); + } + return; + } + if (params.num_layers > 1) { - // Layered framebuffer attachments UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); - } + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); return; } @@ -429,16 +454,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -446,44 +471,73 @@ void 
CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { } } -void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source, +GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, SwizzleSource w_source) { - u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (new_swizzle == swizzle) - return; - swizzle = new_swizzle; - const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; - const GLuint handle = GetTexture(); - const PixelFormat format = surface.GetSurfaceParams().pixel_format; - switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: - case PixelFormat::S8Z24: - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + if (GetSurfaceParams().IsBuffer()) { + return GetTexture(); + } + const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (current_swizzle == new_swizzle) { + return current_view; + } + current_swizzle = new_swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); + OGLTextureView& view = entry->second; + if (!is_cache_miss) { + current_view = view.handle; + return view.handle; + } + view = CreateTextureView(); + current_view = view.handle; + + std::array swizzle{x_source, y_source, z_source, w_source}; + + switch (const PixelFormat format = GetSurfaceParams().pixel_format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + case PixelFormat::S8_UINT_D24_UNORM: + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE, GetComponent(format, x_source == SwizzleSource::R)); - break; - default: - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + + // Make sure we sample the first component + std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) { + return value == SwizzleSource::G ? 
SwizzleSource::R : value; + }); + [[fallthrough]]; + default: { + const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]), + GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])}; + glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); break; } + } + return view.handle; } OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker) - : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} { +TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker_) + : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{ + state_tracker_} { src_framebuffer.Create(); dst_framebuffer.Create(); } @@ -517,8 +571,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(src_params.depth != 1); + UNIMPLEMENTED_IF(dst_params.depth != 1); state_tracker.NotifyScissor0(); state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 02d9981a1..7787134fc 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,15 +80,17 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. 
DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; - void ApplySwizzle(Tegra::Texture::SwizzleSource x_source, + GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source); - void DecorateViewName(GPUVAddr gpu_addr, std::string prefix); + void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix); void MarkAsModified(u64 tick) { surface.MarkAsModified(true, tick); @@ -98,7 +100,7 @@ public: if (is_proxy) { return surface.GetTexture(); } - return texture_view.handle; + return main_view.handle; } GLenum GetFormat() const { @@ -110,29 +112,27 @@ public: } private: - u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) const { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - OGLTextureView CreateTextureView() const; CachedSurface& surface; - GLenum target{}; - GLenum format{}; + const GLenum format; + const GLenum target; + const bool is_proxy; + + std::unordered_map<u32, OGLTextureView> view_cache; + OGLTextureView main_view; - OGLTextureView texture_view; - u32 swizzle{}; - bool is_proxy{}; + // Use an invalid default so it always fails the comparison test + u32 current_swizzle = 0xffffffff; + GLuint current_view = 0; }; class TextureCacheOpenGL final : public TextureCacheBase { public: - explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker); + explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 89f0e04ef..a8be2aa37 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -24,10 +24,11 @@ namespace MaxwellToGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -inline GLenum VertexType(Maxwell::VertexAttribute attrib) { +inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { switch (attrib.type) { - case Maxwell::VertexAttribute::Type::UnsignedInt: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::UnsignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -47,11 +48,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - case Maxwell::VertexAttribute::Type::SignedInt: + break; case Maxwell::VertexAttribute::Type::SignedNorm: + case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -71,9 +73,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return 
GL_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } + break; case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: @@ -87,45 +89,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::UnsignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_UNSIGNED_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_UNSIGNED_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::SignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - return {}; + break; } + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(), + attrib.SizeString()); + return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -137,8 +107,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format)); - UNREACHABLE(); + UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format)); return {}; } @@ -180,31 +149,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, - Tegra::Texture::TextureMipmapFilter mip_filter_mode) { + Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Nearest: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_LINEAR; + return GL_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_LINEAR; + return GL_NEAREST_MIPMAP_LINEAR; } - } - case Tegra::Texture::TextureFilter::Nearest: { - switch (mip_filter_mode) { + break; + case Tegra::Texture::TextureFilter::Linear: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_NEAREST; + return GL_LINEAR; case 
Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_NEAREST; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_LINEAR; } + break; } - } - LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode)); - return GL_LINEAR; + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", + static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode)); + return GL_NEAREST; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -227,10 +197,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; + case Tegra::Texture::WrapMode::MirrorOnceClampOGL: + if (GL_EXT_texture_mirror_clamp) { + return GL_MIRROR_CLAMP_EXT; + } else { + return GL_MIRROR_CLAMP_TO_EDGE; + } } + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -252,8 +227,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast<u32>(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func)); return GL_GREATER; } @@ -275,7 +249,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); return GL_FUNC_ADD; } @@ -339,7 +313,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); return GL_ZERO; } @@ -359,7 +333,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source)); return GL_ZERO; } @@ -390,7 +364,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); return GL_ALWAYS; } @@ -421,7 +395,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil)); return GL_KEEP; } @@ -432,7 +406,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front 
face cull={}", static_cast<u32>(front_face)); return GL_CCW; } @@ -445,7 +419,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); return GL_BACK; } @@ -484,7 +458,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation)); return GL_COPY; } @@ -501,5 +475,10 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { return GL_FILL; } +inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { + // Enumeration order matches register order. We can convert it arithmetically. + return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle); +} + } // namespace MaxwellToGL } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b2a179746..2ccca1993 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,6 +21,8 @@ #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -30,60 +32,6 @@ namespace OpenGL { namespace { -constexpr std::size_t SWAP_CHAIN_SIZE = 3; - -struct Frame { - u32 width{}; /// Width of the frame (to detect resize) - u32 height{}; /// Height of the frame - bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) - OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO - OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread - OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread - GLsync render_fence{}; /// Fence created on the render thread - GLsync present_fence{}; /// Fence created on the presentation thread - bool is_srgb{}; /// Framebuffer is sRGB or RGB -}; - -constexpr char VERTEX_SHADER[] = R"( -#version 430 core - -out gl_PerVertex { - vec4 gl_Position; -}; - -layout (location = 0) in vec2 vert_position; -layout (location = 1) in vec2 vert_tex_coord; -layout (location = 0) out vec2 frag_tex_coord; - -// This is a truncated 3x3 matrix for 2D transformations: -// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. -// The third column performs translation. -// The third row could be used for projection, which we don't need in 2D. It hence is assumed to -// implicitly be [0, 0, 1] -layout (location = 0) uniform mat3x2 modelview_matrix; - -void main() { - // Multiply input position by the rotscale part of the matrix and then manually translate by - // the last column. 
This is equivalent to using a full 3x3 matrix and expanding the vector - // to `vec3(vert_position.xy, 1.0)` - gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); - frag_tex_coord = vert_tex_coord; -} -)"; - -constexpr char FRAGMENT_SHADER[] = R"( -#version 430 core - -layout (location = 0) in vec2 frag_tex_coord; -layout (location = 0) out vec4 color; - -layout (binding = 0) uniform sampler2D color_texture; - -void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); -} -)"; - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -96,24 +44,6 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; -/// Returns true if any debug tool is attached -bool HasDebugTool() { - const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - if (nsight) { - return true; - } - - GLint num_extensions; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); - for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { - const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); - if (!std::strcmp(name, "GL_EXT_debug_tool")) { - return true; - } - } - return false; -} - /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -197,132 +127,15 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? 
GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context) - : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, - has_debug_tool{HasDebugTool()} {} +RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_}, + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} RendererOpenGL::~RendererOpenGL() = default; -MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); -MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); - void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (!framebuffer) { return; @@ -331,79 +144,34 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PrepareRendertarget(framebuffer); RenderScreenshot(); - Frame* frame; - { - MICROPROFILE_SCOPE(OpenGL_WaitPresent); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + DrawScreen(emu_window.GetFramebufferLayout()); - frame = frame_mailbox->GetRenderFrame(); + ++m_current_frame; - // Clean up sync objects before drawing - - // INTEL driver workaround. 
We can't delete the previous render sync object until we are - // sure that the presentation is done - if (frame->present_fence) { - glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - } - - // delete the draw fence if the frame wasn't presented - if (frame->render_fence) { - glDeleteSync(frame->render_fence); - frame->render_fence = 0; - } - - // wait for the presentation to be done - if (frame->present_fence) { - glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(frame->present_fence); - frame->present_fence = 0; - } - } - - { - MICROPROFILE_SCOPE(OpenGL_RenderFrame); - const auto& layout = render_window.GetFramebufferLayout(); - - // Recreate the frame if the size of the window has changed - if (layout.width != frame->width || layout.height != frame->height || - screen_info.display_srgb != frame->is_srgb) { - LOG_DEBUG(Render_OpenGL, "Reloading render frame"); - frame->is_srgb = screen_info.display_srgb; - frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); - DrawScreen(layout); - // Create a fence for the frontend to wait on and swap this frame to OffTex - frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - frame_mailbox->ReleaseRenderFrame(frame); - m_current_frame++; - rasterizer->TickFrame(); - } + rasterizer->TickFrame(); render_window.PollEvents(); - if (has_debug_tool) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); - Present(0); - context.SwapBuffers(); - } + context->SwapBuffers(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { - if (framebuffer) { - // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || - screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format || - gl_framebuffer_data.empty()) { - // Reallocate texture if the framebuffer size has changed. - // This is expected to not happen very often and hence should not be a - // performance problem. - ConfigureFramebufferTexture(screen_info.texture, *framebuffer); - } - - // Load the framebuffer from memory, draw it to the screen, and swap buffers - LoadFBToScreenInfo(*framebuffer); + if (!framebuffer) { + return; + } + // If framebuffer is provided, reload it from memory to a texture + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { + // Reallocate texture if the framebuffer size has changed. + // This is expected to not happen very often and hence should not be a + // performance problem. 
+ ConfigureFramebufferTexture(screen_info.texture, *framebuffer); } + + // Load the framebuffer from memory, draw it to the screen, and swap buffers + LoadFBToScreenInfo(*framebuffer); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -423,7 +191,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)}; + u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)}; rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); // TODO(Rodrigo): Read this from HLE @@ -453,23 +221,22 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color } void RendererOpenGL::InitOpenGLObjects() { - frame_mailbox = std::make_unique<FrameMailbox>(); - - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); // Create shader programs OGLShader vertex_shader; - vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + vertex_shader.Create(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); OGLShader fragment_shader; - fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + fragment_shader.Create(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); vertex_program.Create(true, false, vertex_shader.handle); fragment_program.Create(true, false, fragment_shader.handle); - // Create program pipeline - program_manager.Create(); + pipeline.Create(); + glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); + glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle); // Generate VBO handle for drawing vertex_buffer.Create(); @@ -487,6 +254,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + // Enable unified vertex attributes and query vertex buffer address when the driver supports it + if (device.HasVertexBufferUnifiedMemory()) { + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + + glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, + &vertex_buffer_address); + } } void RendererOpenGL::AddTelemetryFields() { @@ -498,18 +274,18 @@ void RendererOpenGL::AddTelemetryFields() { LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor); LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model); - auto& telemetry_session = system.TelemetrySession(); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version); + constexpr auto user_system = Common::Telemetry::FieldType::UserSystem; + telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor); + telemetry_session.AddField(user_system, "GPU_Model", gpu_model); + telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version); } void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = 
std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, - program_manager, state_tracker); + rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device, + screen_info, program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -525,12 +301,12 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, GLint internal_format; switch (framebuffer.pixel_format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: + case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM: internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; break; - case Tegra::FramebufferConfig::PixelFormat::RGB565: + case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: internal_format = GL_RGB565; texture.gl_format = GL_RGB; texture.gl_type = GL_UNSIGNED_SHORT_5_6_5; @@ -551,8 +327,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { if (renderer_settings.set_background_color) { // Update background color before drawing - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); } // Set projection matrix @@ -620,10 +396,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { state_tracker.NotifyClipControl(); state_tracker.NotifyAlphaTest(); - program_manager.UseVertexShader(vertex_program.handle); - program_manager.UseGeometryShader(0); - program_manager.UseFragmentShader(fragment_program.handle); - program_manager.BindGraphicsPipeline(); + program_manager.BindHostPipeline(pipeline.handle); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { @@ -658,58 +431,21 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { offsetof(ScreenRectVertex, tex_coord)); glVertexAttribBinding(PositionLocation, 0); glVertexAttribBinding(TexCoordLocation, 0); - glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + if (device.HasVertexBufferUnifiedMemory()) { + glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, + sizeof(vertices)); + } else { + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + } glBindTextureUnit(0, screen_info.display_texture); glBindSampler(0, 0); glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); -} -bool RendererOpenGL::TryPresent(int timeout_ms) { - if (has_debug_tool) { - LOG_DEBUG(Render_OpenGL, - "Skipping presentation because we are presenting on the main context"); - return false; - } - return Present(timeout_ms); -} - -bool RendererOpenGL::Present(int timeout_ms) { - const auto& layout = render_window.GetFramebufferLayout(); - auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); - if (!frame) { - LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); - return false; - } - - // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a - // readback since we won't be doing any blending - glClear(GL_COLOR_BUFFER_BIT); - - // Recreate the presentation FBO if the color attachment was changed - if (frame->color_reloaded) { - LOG_DEBUG(Render_OpenGL, "Reloading present frame"); - frame_mailbox->ReloadPresentFrame(frame, layout.width, 
layout.height); - } - glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); - // INTEL workaround. - // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete - // it on the emulation thread without too much penalty - // glDeleteSync(frame.render_sync); - // frame.render_sync = 0; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); - glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, - GL_COLOR_BUFFER_BIT, GL_LINEAR); - - // Insert fence for the main thread to block on - frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - return true; + program_manager.RestoreGuestPipeline(); } void RendererOpenGL::RenderScreenshot() { @@ -726,7 +462,7 @@ void RendererOpenGL::RenderScreenshot() { screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); @@ -751,8 +487,9 @@ void RendererOpenGL::RenderScreenshot() { } bool RendererOpenGL::Init() { - if (GLAD_GL_KHR_debug) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); glDebugMessageCallback(DebugHandler, nullptr); } diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 50b647661..9ef181f95 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -9,22 +9,32 @@ #include "common/common_types.h" #include "common/math_util.h" #include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; -} +class TelemetrySession; +} // namespace Core namespace Core::Frontend { class EmuWindow; } +namespace Core::Memory { +class Memory; +} + namespace Layout { struct FramebufferLayout; } +namespace Tegra { +class GPU; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -45,24 +55,17 @@ struct ScreenInfo { TextureInfo texture; }; -struct PresentationTexture { - u32 width = 0; - u32 height = 0; - OGLTexture texture; -}; - -class FrameMailbox; - class RendererOpenGL final : public VideoCore::RendererBase { public: - explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context); + explicit RendererOpenGL(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, + Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererOpenGL() override; bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; private: /// Initializes the OpenGL state and creates persistent objects. 
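Editor's note (not part of the commit): the renderer changes above drop the FrameMailbox and, when the driver exposes GL_NV_vertex_buffer_unified_memory together with the NV_shader_buffer_load residency calls, source the present quad's vertices from a raw GPU address (see the glMakeNamedBufferResidentNV/glBufferAddressRangeNV calls in InitOpenGLObjects and DrawScreen above, and the vertex_buffer_address member added in the header below). The following is a minimal sketch of that pattern under those assumptions; the glad include, the ScreenQuadVertex struct and the function name are illustrative, not taken from the commit.

    #include <glad/glad.h>

    // Illustrative vertex layout standing in for the renderer's ScreenRectVertex.
    struct ScreenQuadVertex {
        float position[2];
        float tex_coord[2];
    };

    // Pins `buffer` in GPU memory, queries its 64-bit GPU address, and sources vertex
    // attribute binding 0 from that address instead of from the buffer object.
    // Assumes a current context where GL_NV_vertex_buffer_unified_memory is available.
    void BindPresentVertexBufferByAddress(GLuint buffer, GLsizeiptr size_in_bytes) {
        GLuint64EXT address = 0;
        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
        glMakeNamedBufferResidentNV(buffer, GL_READ_ONLY);
        glGetNamedBufferParameterui64vNV(buffer, GL_BUFFER_GPU_ADDRESS_NV, &address);

        // The binding point still supplies the stride; the buffer handle is left as 0.
        glBindVertexBuffer(0, 0, 0, sizeof(ScreenQuadVertex));
        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, address, size_in_bytes);
    }

As the DrawScreen hunk above shows, drivers without the extension keep using the plain glBindVertexBuffer path.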
@@ -90,37 +93,36 @@ private: void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); - bool Present(int timeout_ms); - + Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; - Core::System& system; - Core::Frontend::GraphicsContext& context; + Core::Memory::Memory& cpu_memory; + Tegra::GPU& gpu; - StateTracker state_tracker{system}; + const Device device; + StateTracker state_tracker{gpu}; // OpenGL object IDs OGLBuffer vertex_buffer; OGLProgram vertex_program; OGLProgram fragment_program; + OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; + // GPU address of the vertex buffer + GLuint64EXT vertex_buffer_address = 0; + /// Display information for Switch screen ScreenInfo screen_info; /// Global dummy shader pipeline - GLShader::ProgramManager program_manager; + ProgramManager program_manager; /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; /// Used for transforming the framebuffer orientation - Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags; + Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{}; Common::Rectangle<int> framebuffer_crop_rect; - - /// Frame presentation mailbox - std::unique_ptr<FrameMailbox> frame_mailbox; - - bool has_debug_tool = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index b751086fa..6d7bb16b2 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -14,68 +14,6 @@ namespace OpenGL { -struct VertexArrayPushBuffer::Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; -}; - -VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker) - : state_tracker{state_tracker} {} - -VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; - -void VertexArrayPushBuffer::Setup() { - index_buffer = nullptr; - vertex_buffers.clear(); -} - -void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { - index_buffer = buffer; -} - -void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, - GLintptr offset, GLsizei stride) { - vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); -} - -void VertexArrayPushBuffer::Bind() { - if (index_buffer) { - state_tracker.BindIndexBuffer(*index_buffer); - } - - for (const auto& entry : vertex_buffers) { - glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride); - } -} - -struct BindBuffersRangePushBuffer::Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; -}; - -BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} - -BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; - -void BindBuffersRangePushBuffer::Setup() { - entries.clear(); -} - -void BindBuffersRangePushBuffer::Push(GLuint binding, const GLuint* buffer, GLintptr offset, - GLsizeiptr size) { - entries.push_back(Entry{binding, buffer, offset, size}); -} - -void BindBuffersRangePushBuffer::Bind() { - for (const Entry& entry : entries) { - glBindBufferRange(target, entry.binding, *entry.buffer, entry.offset, entry.size); - } -} - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) { if (!GLAD_GL_KHR_debug) { // We don't need to throw an error as this is just for debugging diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 
47ee3177b..9c09ee12c 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,49 +11,6 @@ namespace OpenGL { -class StateTracker; - -class VertexArrayPushBuffer final { -public: - explicit VertexArrayPushBuffer(StateTracker& state_tracker); - ~VertexArrayPushBuffer(); - - void Setup(); - - void SetIndexBuffer(const GLuint* buffer); - - void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset, - GLsizei stride); - - void Bind(); - -private: - struct Entry; - - StateTracker& state_tracker; - - const GLuint* index_buffer{}; - std::vector<Entry> vertex_buffers; -}; - -class BindBuffersRangePushBuffer final { -public: - explicit BindBuffersRangePushBuffer(GLenum target); - ~BindBuffersRangePushBuffer(); - - void Setup(); - - void Push(GLuint binding, const GLuint* buffer, GLintptr offset, GLsizeiptr size); - - void Bind(); - -private: - struct Entry; - - GLenum target; - std::vector<Entry> entries; -}; - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {}); } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 2bb376555..da5c550ea 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -2,10 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> +#include <cstring> #include <tuple> #include <boost/functional/hash.hpp> +#include "common/cityhash.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -13,289 +16,375 @@ namespace Vulkan { namespace { -constexpr FixedPipelineState::DepthStencil GetDepthStencilState(const Maxwell& regs) { - const FixedPipelineState::StencilFace front_stencil( - regs.stencil_front_op_fail, regs.stencil_front_op_zfail, regs.stencil_front_op_zpass, - regs.stencil_front_func_func); - const FixedPipelineState::StencilFace back_stencil = - regs.stencil_two_side_enable - ? FixedPipelineState::StencilFace(regs.stencil_back_op_fail, regs.stencil_back_op_zfail, - regs.stencil_back_op_zpass, - regs.stencil_back_func_func) - : front_stencil; - return FixedPipelineState::DepthStencil( - regs.depth_test_enable == 1, regs.depth_write_enabled == 1, regs.depth_bounds_enable == 1, - regs.stencil_enable == 1, regs.depth_test_func, front_stencil, back_stencil); -} +constexpr std::size_t POINT = 0; +constexpr std::size_t LINE = 1; +constexpr std::size_t POLYGON = 2; +constexpr std::array POLYGON_OFFSET_ENABLE_LUT = { + POINT, // Points + LINE, // Lines + LINE, // LineLoop + LINE, // LineStrip + POLYGON, // Triangles + POLYGON, // TriangleStrip + POLYGON, // TriangleFan + POLYGON, // Quads + POLYGON, // QuadStrip + POLYGON, // Polygon + LINE, // LinesAdjacency + LINE, // LineStripAdjacency + POLYGON, // TrianglesAdjacency + POLYGON, // TriangleStripAdjacency + POLYGON, // Patches +}; -constexpr FixedPipelineState::InputAssembly GetInputAssemblyState(const Maxwell& regs) { - return FixedPipelineState::InputAssembly( - regs.draw.topology, regs.primitive_restart.enabled, - regs.draw.topology == Maxwell::PrimitiveTopology::Points ? regs.point_size : 0.0f); -} +} // Anonymous namespace -constexpr FixedPipelineState::BlendingAttachment GetBlendingAttachmentState( - const Maxwell& regs, std::size_t render_target) { - const auto& mask = regs.color_mask[regs.color_mask_common ? 
0 : render_target]; - const std::array components = {mask.R != 0, mask.G != 0, mask.B != 0, mask.A != 0}; - - const FixedPipelineState::BlendingAttachment default_blending( - false, Maxwell::Blend::Equation::Add, Maxwell::Blend::Factor::One, - Maxwell::Blend::Factor::Zero, Maxwell::Blend::Equation::Add, Maxwell::Blend::Factor::One, - Maxwell::Blend::Factor::Zero, components); - if (render_target >= regs.rt_control.count) { - return default_blending; +void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_state) { + const std::array enabled_lut = {regs.polygon_offset_point_enable, + regs.polygon_offset_line_enable, + regs.polygon_offset_fill_enable}; + const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); + + raw = 0; + primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0); + depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0); + depth_clamp_disabled.Assign(regs.view_volume_clip_control.depth_clamp_disabled.Value()); + ndc_minus_one_to_one.Assign(regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1 : 0); + polygon_mode.Assign(PackPolygonMode(regs.polygon_mode_front)); + patch_control_points_minus_one.Assign(regs.patch_vertices - 1); + tessellation_primitive.Assign(static_cast<u32>(regs.tess_mode.prim.Value())); + tessellation_spacing.Assign(static_cast<u32>(regs.tess_mode.spacing.Value())); + tessellation_clockwise.Assign(regs.tess_mode.cw.Value()); + logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); + logic_op.Assign(PackLogicOp(regs.logic_op.operation)); + rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0); + topology.Assign(regs.draw.topology); + + std::memcpy(&point_size, ®s.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast + + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + binding_divisors[index] = + regs.instanced_arrays.IsInstancingEnabled(index) ? regs.vertex_array[index].divisor : 0; } - if (!regs.independent_blend_enable) { - const auto& src = regs.blend; - if (!src.enable[render_target]) { - return default_blending; - } - return FixedPipelineState::BlendingAttachment( - true, src.equation_rgb, src.factor_source_rgb, src.factor_dest_rgb, src.equation_a, - src.factor_source_a, src.factor_dest_a, components); + for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { + const auto& input = regs.vertex_attrib_format[index]; + auto& attribute = attributes[index]; + attribute.raw = 0; + attribute.enabled.Assign(input.IsConstant() ? 
0 : 1); + attribute.buffer.Assign(input.buffer); + attribute.offset.Assign(input.offset); + attribute.type.Assign(static_cast<u32>(input.type.Value())); + attribute.size.Assign(static_cast<u32>(input.size.Value())); } - if (!regs.blend.enable[render_target]) { - return default_blending; + for (std::size_t index = 0; index < std::size(attachments); ++index) { + attachments[index].Fill(regs, index); } - const auto& src = regs.independent_blend[render_target]; - return FixedPipelineState::BlendingAttachment( - true, src.equation_rgb, src.factor_source_rgb, src.factor_dest_rgb, src.equation_a, - src.factor_source_a, src.factor_dest_a, components); -} -constexpr FixedPipelineState::ColorBlending GetColorBlendingState(const Maxwell& regs) { - return FixedPipelineState::ColorBlending( - {regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, regs.blend_color.a}, - regs.rt_control.count, - {GetBlendingAttachmentState(regs, 0), GetBlendingAttachmentState(regs, 1), - GetBlendingAttachmentState(regs, 2), GetBlendingAttachmentState(regs, 3), - GetBlendingAttachmentState(regs, 4), GetBlendingAttachmentState(regs, 5), - GetBlendingAttachmentState(regs, 6), GetBlendingAttachmentState(regs, 7)}); -} + const auto& transform = regs.viewport_transform; + std::transform(transform.begin(), transform.end(), viewport_swizzles.begin(), + [](const auto& viewport) { return static_cast<u16>(viewport.swizzle.raw); }); -constexpr FixedPipelineState::Tessellation GetTessellationState(const Maxwell& regs) { - return FixedPipelineState::Tessellation(regs.patch_vertices, regs.tess_mode.prim, - regs.tess_mode.spacing, regs.tess_mode.cw != 0); + if (!has_extended_dynamic_state) { + no_extended_dynamic_state.Assign(1); + dynamic_state.Fill(regs); + } } -constexpr std::size_t Point = 0; -constexpr std::size_t Line = 1; -constexpr std::size_t Polygon = 2; -constexpr std::array PolygonOffsetEnableLUT = { - Point, // Points - Line, // Lines - Line, // LineLoop - Line, // LineStrip - Polygon, // Triangles - Polygon, // TriangleStrip - Polygon, // TriangleFan - Polygon, // Quads - Polygon, // QuadStrip - Polygon, // Polygon - Line, // LinesAdjacency - Line, // LineStripAdjacency - Polygon, // TrianglesAdjacency - Polygon, // TriangleStripAdjacency - Polygon, // Patches -}; +void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size_t index) { + const auto& mask = regs.color_mask[regs.color_mask_common ? 
0 : index]; -constexpr FixedPipelineState::Rasterizer GetRasterizerState(const Maxwell& regs) { - const std::array enabled_lut = {regs.polygon_offset_point_enable, - regs.polygon_offset_line_enable, - regs.polygon_offset_fill_enable}; - const auto topology = static_cast<std::size_t>(regs.draw.topology.Value()); - const bool depth_bias_enabled = enabled_lut[PolygonOffsetEnableLUT[topology]]; - - const auto& clip = regs.view_volume_clip_control; - const bool depth_clamp_enabled = clip.depth_clamp_near == 1 || clip.depth_clamp_far == 1; - - Maxwell::FrontFace front_face = regs.front_face; - if (regs.screen_y_control.triangle_rast_flip != 0 && - regs.viewport_transform[0].scale_y > 0.0f) { - if (front_face == Maxwell::FrontFace::CounterClockWise) - front_face = Maxwell::FrontFace::ClockWise; - else if (front_face == Maxwell::FrontFace::ClockWise) - front_face = Maxwell::FrontFace::CounterClockWise; - } + raw = 0; + mask_r.Assign(mask.R); + mask_g.Assign(mask.G); + mask_b.Assign(mask.B); + mask_a.Assign(mask.A); - const bool gl_ndc = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; - return FixedPipelineState::Rasterizer(regs.cull_test_enabled, depth_bias_enabled, - depth_clamp_enabled, gl_ndc, regs.cull_face, front_face); -} + // TODO: C++20 Use templated lambda to deduplicate code -} // Anonymous namespace - -std::size_t FixedPipelineState::VertexBinding::Hash() const noexcept { - return (index << stride) ^ divisor; -} + if (!regs.independent_blend_enable) { + const auto& src = regs.blend; + if (!src.enable[index]) { + return; + } + equation_rgb.Assign(PackBlendEquation(src.equation_rgb)); + equation_a.Assign(PackBlendEquation(src.equation_a)); + factor_source_rgb.Assign(PackBlendFactor(src.factor_source_rgb)); + factor_dest_rgb.Assign(PackBlendFactor(src.factor_dest_rgb)); + factor_source_a.Assign(PackBlendFactor(src.factor_source_a)); + factor_dest_a.Assign(PackBlendFactor(src.factor_dest_a)); + enable.Assign(1); + return; + } -bool FixedPipelineState::VertexBinding::operator==(const VertexBinding& rhs) const noexcept { - return std::tie(index, stride, divisor) == std::tie(rhs.index, rhs.stride, rhs.divisor); + if (!regs.blend.enable[index]) { + return; + } + const auto& src = regs.independent_blend[index]; + equation_rgb.Assign(PackBlendEquation(src.equation_rgb)); + equation_a.Assign(PackBlendEquation(src.equation_a)); + factor_source_rgb.Assign(PackBlendFactor(src.factor_source_rgb)); + factor_dest_rgb.Assign(PackBlendFactor(src.factor_dest_rgb)); + factor_source_a.Assign(PackBlendFactor(src.factor_source_a)); + factor_dest_a.Assign(PackBlendFactor(src.factor_dest_a)); + enable.Assign(1); } -std::size_t FixedPipelineState::VertexAttribute::Hash() const noexcept { - return static_cast<std::size_t>(index) ^ (static_cast<std::size_t>(buffer) << 13) ^ - (static_cast<std::size_t>(type) << 22) ^ (static_cast<std::size_t>(size) << 31) ^ - (static_cast<std::size_t>(offset) << 36); -} +void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { + u32 packed_front_face = PackFrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0) { + // Flip front face + packed_front_face = 1 - packed_front_face; + } -bool FixedPipelineState::VertexAttribute::operator==(const VertexAttribute& rhs) const noexcept { - return std::tie(index, buffer, type, size, offset) == - std::tie(rhs.index, rhs.buffer, rhs.type, rhs.size, rhs.offset); + raw1 = 0; + raw2 = 0; + front.action_stencil_fail.Assign(PackStencilOp(regs.stencil_front_op_fail)); + 
front.action_depth_fail.Assign(PackStencilOp(regs.stencil_front_op_zfail)); + front.action_depth_pass.Assign(PackStencilOp(regs.stencil_front_op_zpass)); + front.test_func.Assign(PackComparisonOp(regs.stencil_front_func_func)); + if (regs.stencil_two_side_enable) { + back.action_stencil_fail.Assign(PackStencilOp(regs.stencil_back_op_fail)); + back.action_depth_fail.Assign(PackStencilOp(regs.stencil_back_op_zfail)); + back.action_depth_pass.Assign(PackStencilOp(regs.stencil_back_op_zpass)); + back.test_func.Assign(PackComparisonOp(regs.stencil_back_func_func)); + } else { + back.action_stencil_fail.Assign(front.action_stencil_fail); + back.action_depth_fail.Assign(front.action_depth_fail); + back.action_depth_pass.Assign(front.action_depth_pass); + back.test_func.Assign(front.test_func); + } + stencil_enable.Assign(regs.stencil_enable); + depth_write_enable.Assign(regs.depth_write_enabled); + depth_bounds_enable.Assign(regs.depth_bounds_enable); + depth_test_enable.Assign(regs.depth_test_enable); + front_face.Assign(packed_front_face); + depth_test_func.Assign(PackComparisonOp(regs.depth_test_func)); + cull_face.Assign(PackCullFace(regs.cull_face)); + cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0); + + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& input = regs.vertex_array[index]; + VertexBinding& binding = vertex_bindings[index]; + binding.raw = 0; + binding.enabled.Assign(input.IsEnabled() ? 1 : 0); + binding.stride.Assign(static_cast<u16>(input.stride.Value())); + } } -std::size_t FixedPipelineState::StencilFace::Hash() const noexcept { - return static_cast<std::size_t>(action_stencil_fail) ^ - (static_cast<std::size_t>(action_depth_fail) << 4) ^ - (static_cast<std::size_t>(action_depth_fail) << 20) ^ - (static_cast<std::size_t>(action_depth_pass) << 36); +std::size_t FixedPipelineState::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size()); + return static_cast<std::size_t>(hash); } -bool FixedPipelineState::StencilFace::operator==(const StencilFace& rhs) const noexcept { - return std::tie(action_stencil_fail, action_depth_fail, action_depth_pass, test_func) == - std::tie(rhs.action_stencil_fail, rhs.action_depth_fail, rhs.action_depth_pass, - rhs.test_func); +bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept { + return std::memcmp(this, &rhs, Size()) == 0; } -std::size_t FixedPipelineState::BlendingAttachment::Hash() const noexcept { - return static_cast<std::size_t>(enable) ^ (static_cast<std::size_t>(rgb_equation) << 5) ^ - (static_cast<std::size_t>(src_rgb_func) << 10) ^ - (static_cast<std::size_t>(dst_rgb_func) << 15) ^ - (static_cast<std::size_t>(a_equation) << 20) ^ - (static_cast<std::size_t>(src_a_func) << 25) ^ - (static_cast<std::size_t>(dst_a_func) << 30) ^ - (static_cast<std::size_t>(components[0]) << 35) ^ - (static_cast<std::size_t>(components[1]) << 36) ^ - (static_cast<std::size_t>(components[2]) << 37) ^ - (static_cast<std::size_t>(components[3]) << 38); +u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept { + // OpenGL enums go from 0x200 to 0x207 and the others from 1 to 8 + // If we subtract 0x200 from the OpenGL enums and 1 from the others we get a 0-7 range. + // Perfect for a hash. + const u32 value = static_cast<u32>(op); + return value - (value >= 0x200 ? 
0x200 : 1); } -bool FixedPipelineState::BlendingAttachment::operator==(const BlendingAttachment& rhs) const - noexcept { - return std::tie(enable, rgb_equation, src_rgb_func, dst_rgb_func, a_equation, src_a_func, - dst_a_func, components) == - std::tie(rhs.enable, rhs.rgb_equation, rhs.src_rgb_func, rhs.dst_rgb_func, - rhs.a_equation, rhs.src_a_func, rhs.dst_a_func, rhs.components); +Maxwell::ComparisonOp FixedPipelineState::UnpackComparisonOp(u32 packed) noexcept { + // Read PackComparisonOp for the logic behind this. + return static_cast<Maxwell::ComparisonOp>(packed + 1); } -std::size_t FixedPipelineState::VertexInput::Hash() const noexcept { - std::size_t hash = num_bindings ^ (num_attributes << 32); - for (std::size_t i = 0; i < num_bindings; ++i) { - boost::hash_combine(hash, bindings[i].Hash()); +u32 FixedPipelineState::PackStencilOp(Maxwell::StencilOp op) noexcept { + switch (op) { + case Maxwell::StencilOp::Keep: + case Maxwell::StencilOp::KeepOGL: + return 0; + case Maxwell::StencilOp::Zero: + case Maxwell::StencilOp::ZeroOGL: + return 1; + case Maxwell::StencilOp::Replace: + case Maxwell::StencilOp::ReplaceOGL: + return 2; + case Maxwell::StencilOp::Incr: + case Maxwell::StencilOp::IncrOGL: + return 3; + case Maxwell::StencilOp::Decr: + case Maxwell::StencilOp::DecrOGL: + return 4; + case Maxwell::StencilOp::Invert: + case Maxwell::StencilOp::InvertOGL: + return 5; + case Maxwell::StencilOp::IncrWrap: + case Maxwell::StencilOp::IncrWrapOGL: + return 6; + case Maxwell::StencilOp::DecrWrap: + case Maxwell::StencilOp::DecrWrapOGL: + return 7; } - for (std::size_t i = 0; i < num_attributes; ++i) { - boost::hash_combine(hash, attributes[i].Hash()); - } - return hash; + return 0; } -bool FixedPipelineState::VertexInput::operator==(const VertexInput& rhs) const noexcept { - return std::equal(bindings.begin(), bindings.begin() + num_bindings, rhs.bindings.begin(), - rhs.bindings.begin() + rhs.num_bindings) && - std::equal(attributes.begin(), attributes.begin() + num_attributes, - rhs.attributes.begin(), rhs.attributes.begin() + rhs.num_attributes); +Maxwell::StencilOp FixedPipelineState::UnpackStencilOp(u32 packed) noexcept { + static constexpr std::array LUT = {Maxwell::StencilOp::Keep, Maxwell::StencilOp::Zero, + Maxwell::StencilOp::Replace, Maxwell::StencilOp::Incr, + Maxwell::StencilOp::Decr, Maxwell::StencilOp::Invert, + Maxwell::StencilOp::IncrWrap, Maxwell::StencilOp::DecrWrap}; + return LUT[packed]; } -std::size_t FixedPipelineState::InputAssembly::Hash() const noexcept { - std::size_t point_size_int = 0; - std::memcpy(&point_size_int, &point_size, sizeof(point_size)); - return (static_cast<std::size_t>(topology) << 24) ^ (point_size_int << 32) ^ - static_cast<std::size_t>(primitive_restart_enable); +u32 FixedPipelineState::PackCullFace(Maxwell::CullFace cull) noexcept { + // FrontAndBack is 0x408, by subtracting 0x406 from it we get 2. + // Individual cull faces are in 0x404 and 0x405, subtracting 0x404 we get 0 and 1. + const u32 value = static_cast<u32>(cull); + return value - (value == 0x408 ? 
0x406 : 0x404); } -bool FixedPipelineState::InputAssembly::operator==(const InputAssembly& rhs) const noexcept { - return std::tie(topology, primitive_restart_enable, point_size) == - std::tie(rhs.topology, rhs.primitive_restart_enable, rhs.point_size); +Maxwell::CullFace FixedPipelineState::UnpackCullFace(u32 packed) noexcept { + static constexpr std::array LUT = {Maxwell::CullFace::Front, Maxwell::CullFace::Back, + Maxwell::CullFace::FrontAndBack}; + return LUT[packed]; } -std::size_t FixedPipelineState::Tessellation::Hash() const noexcept { - return static_cast<std::size_t>(patch_control_points) ^ - (static_cast<std::size_t>(primitive) << 6) ^ (static_cast<std::size_t>(spacing) << 8) ^ - (static_cast<std::size_t>(clockwise) << 10); +u32 FixedPipelineState::PackFrontFace(Maxwell::FrontFace face) noexcept { + return static_cast<u32>(face) - 0x900; } -bool FixedPipelineState::Tessellation::operator==(const Tessellation& rhs) const noexcept { - return std::tie(patch_control_points, primitive, spacing, clockwise) == - std::tie(rhs.patch_control_points, rhs.primitive, rhs.spacing, rhs.clockwise); +Maxwell::FrontFace FixedPipelineState::UnpackFrontFace(u32 packed) noexcept { + return static_cast<Maxwell::FrontFace>(packed + 0x900); } -std::size_t FixedPipelineState::Rasterizer::Hash() const noexcept { - return static_cast<std::size_t>(cull_enable) ^ - (static_cast<std::size_t>(depth_bias_enable) << 1) ^ - (static_cast<std::size_t>(depth_clamp_enable) << 2) ^ - (static_cast<std::size_t>(ndc_minus_one_to_one) << 3) ^ - (static_cast<std::size_t>(cull_face) << 24) ^ - (static_cast<std::size_t>(front_face) << 48); +u32 FixedPipelineState::PackPolygonMode(Maxwell::PolygonMode mode) noexcept { + return static_cast<u32>(mode) - 0x1B00; } -bool FixedPipelineState::Rasterizer::operator==(const Rasterizer& rhs) const noexcept { - return std::tie(cull_enable, depth_bias_enable, depth_clamp_enable, ndc_minus_one_to_one, - cull_face, front_face) == - std::tie(rhs.cull_enable, rhs.depth_bias_enable, rhs.depth_clamp_enable, - rhs.ndc_minus_one_to_one, rhs.cull_face, rhs.front_face); +Maxwell::PolygonMode FixedPipelineState::UnpackPolygonMode(u32 packed) noexcept { + return static_cast<Maxwell::PolygonMode>(packed + 0x1B00); } -std::size_t FixedPipelineState::DepthStencil::Hash() const noexcept { - std::size_t hash = static_cast<std::size_t>(depth_test_enable) ^ - (static_cast<std::size_t>(depth_write_enable) << 1) ^ - (static_cast<std::size_t>(depth_bounds_enable) << 2) ^ - (static_cast<std::size_t>(stencil_enable) << 3) ^ - (static_cast<std::size_t>(depth_test_function) << 4); - boost::hash_combine(hash, front_stencil.Hash()); - boost::hash_combine(hash, back_stencil.Hash()); - return hash; +u32 FixedPipelineState::PackLogicOp(Maxwell::LogicOperation op) noexcept { + return static_cast<u32>(op) - 0x1500; } -bool FixedPipelineState::DepthStencil::operator==(const DepthStencil& rhs) const noexcept { - return std::tie(depth_test_enable, depth_write_enable, depth_bounds_enable, depth_test_function, - stencil_enable, front_stencil, back_stencil) == - std::tie(rhs.depth_test_enable, rhs.depth_write_enable, rhs.depth_bounds_enable, - rhs.depth_test_function, rhs.stencil_enable, rhs.front_stencil, - rhs.back_stencil); +Maxwell::LogicOperation FixedPipelineState::UnpackLogicOp(u32 packed) noexcept { + return static_cast<Maxwell::LogicOperation>(packed + 0x1500); } -std::size_t FixedPipelineState::ColorBlending::Hash() const noexcept { - std::size_t hash = attachments_count << 13; - for (std::size_t rt = 0; rt < 
static_cast<std::size_t>(attachments_count); ++rt) { - boost::hash_combine(hash, attachments[rt].Hash()); +u32 FixedPipelineState::PackBlendEquation(Maxwell::Blend::Equation equation) noexcept { + switch (equation) { + case Maxwell::Blend::Equation::Add: + case Maxwell::Blend::Equation::AddGL: + return 0; + case Maxwell::Blend::Equation::Subtract: + case Maxwell::Blend::Equation::SubtractGL: + return 1; + case Maxwell::Blend::Equation::ReverseSubtract: + case Maxwell::Blend::Equation::ReverseSubtractGL: + return 2; + case Maxwell::Blend::Equation::Min: + case Maxwell::Blend::Equation::MinGL: + return 3; + case Maxwell::Blend::Equation::Max: + case Maxwell::Blend::Equation::MaxGL: + return 4; } - return hash; + return 0; } -bool FixedPipelineState::ColorBlending::operator==(const ColorBlending& rhs) const noexcept { - return std::equal(attachments.begin(), attachments.begin() + attachments_count, - rhs.attachments.begin(), rhs.attachments.begin() + rhs.attachments_count); +Maxwell::Blend::Equation FixedPipelineState::UnpackBlendEquation(u32 packed) noexcept { + static constexpr std::array LUT = { + Maxwell::Blend::Equation::Add, Maxwell::Blend::Equation::Subtract, + Maxwell::Blend::Equation::ReverseSubtract, Maxwell::Blend::Equation::Min, + Maxwell::Blend::Equation::Max}; + return LUT[packed]; } -std::size_t FixedPipelineState::Hash() const noexcept { - std::size_t hash = 0; - boost::hash_combine(hash, vertex_input.Hash()); - boost::hash_combine(hash, input_assembly.Hash()); - boost::hash_combine(hash, tessellation.Hash()); - boost::hash_combine(hash, rasterizer.Hash()); - boost::hash_combine(hash, depth_stencil.Hash()); - boost::hash_combine(hash, color_blending.Hash()); - return hash; -} - -bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept { - return std::tie(vertex_input, input_assembly, tessellation, rasterizer, depth_stencil, - color_blending) == std::tie(rhs.vertex_input, rhs.input_assembly, - rhs.tessellation, rhs.rasterizer, rhs.depth_stencil, - rhs.color_blending); +u32 FixedPipelineState::PackBlendFactor(Maxwell::Blend::Factor factor) noexcept { + switch (factor) { + case Maxwell::Blend::Factor::Zero: + case Maxwell::Blend::Factor::ZeroGL: + return 0; + case Maxwell::Blend::Factor::One: + case Maxwell::Blend::Factor::OneGL: + return 1; + case Maxwell::Blend::Factor::SourceColor: + case Maxwell::Blend::Factor::SourceColorGL: + return 2; + case Maxwell::Blend::Factor::OneMinusSourceColor: + case Maxwell::Blend::Factor::OneMinusSourceColorGL: + return 3; + case Maxwell::Blend::Factor::SourceAlpha: + case Maxwell::Blend::Factor::SourceAlphaGL: + return 4; + case Maxwell::Blend::Factor::OneMinusSourceAlpha: + case Maxwell::Blend::Factor::OneMinusSourceAlphaGL: + return 5; + case Maxwell::Blend::Factor::DestAlpha: + case Maxwell::Blend::Factor::DestAlphaGL: + return 6; + case Maxwell::Blend::Factor::OneMinusDestAlpha: + case Maxwell::Blend::Factor::OneMinusDestAlphaGL: + return 7; + case Maxwell::Blend::Factor::DestColor: + case Maxwell::Blend::Factor::DestColorGL: + return 8; + case Maxwell::Blend::Factor::OneMinusDestColor: + case Maxwell::Blend::Factor::OneMinusDestColorGL: + return 9; + case Maxwell::Blend::Factor::SourceAlphaSaturate: + case Maxwell::Blend::Factor::SourceAlphaSaturateGL: + return 10; + case Maxwell::Blend::Factor::Source1Color: + case Maxwell::Blend::Factor::Source1ColorGL: + return 11; + case Maxwell::Blend::Factor::OneMinusSource1Color: + case Maxwell::Blend::Factor::OneMinusSource1ColorGL: + return 12; + case 
Maxwell::Blend::Factor::Source1Alpha: + case Maxwell::Blend::Factor::Source1AlphaGL: + return 13; + case Maxwell::Blend::Factor::OneMinusSource1Alpha: + case Maxwell::Blend::Factor::OneMinusSource1AlphaGL: + return 14; + case Maxwell::Blend::Factor::ConstantColor: + case Maxwell::Blend::Factor::ConstantColorGL: + return 15; + case Maxwell::Blend::Factor::OneMinusConstantColor: + case Maxwell::Blend::Factor::OneMinusConstantColorGL: + return 16; + case Maxwell::Blend::Factor::ConstantAlpha: + case Maxwell::Blend::Factor::ConstantAlphaGL: + return 17; + case Maxwell::Blend::Factor::OneMinusConstantAlpha: + case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: + return 18; + } + return 0; } -FixedPipelineState GetFixedPipelineState(const Maxwell& regs) { - FixedPipelineState fixed_state; - fixed_state.input_assembly = GetInputAssemblyState(regs); - fixed_state.tessellation = GetTessellationState(regs); - fixed_state.rasterizer = GetRasterizerState(regs); - fixed_state.depth_stencil = GetDepthStencilState(regs); - fixed_state.color_blending = GetColorBlendingState(regs); - return fixed_state; +Maxwell::Blend::Factor FixedPipelineState::UnpackBlendFactor(u32 packed) noexcept { + static constexpr std::array LUT = { + Maxwell::Blend::Factor::Zero, + Maxwell::Blend::Factor::One, + Maxwell::Blend::Factor::SourceColor, + Maxwell::Blend::Factor::OneMinusSourceColor, + Maxwell::Blend::Factor::SourceAlpha, + Maxwell::Blend::Factor::OneMinusSourceAlpha, + Maxwell::Blend::Factor::DestAlpha, + Maxwell::Blend::Factor::OneMinusDestAlpha, + Maxwell::Blend::Factor::DestColor, + Maxwell::Blend::Factor::OneMinusDestColor, + Maxwell::Blend::Factor::SourceAlphaSaturate, + Maxwell::Blend::Factor::Source1Color, + Maxwell::Blend::Factor::OneMinusSource1Color, + Maxwell::Blend::Factor::Source1Alpha, + Maxwell::Blend::Factor::OneMinusSource1Alpha, + Maxwell::Blend::Factor::ConstantColor, + Maxwell::Blend::Factor::OneMinusConstantColor, + Maxwell::Blend::Factor::ConstantAlpha, + Maxwell::Blend::Factor::OneMinusConstantAlpha, + }; + return LUT[packed]; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index 4c8ba7f90..2c18eeaae 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -7,6 +7,7 @@ #include <array> #include <type_traits> +#include "common/bit_field.h" #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" @@ -16,230 +17,184 @@ namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -// TODO(Rodrigo): Optimize this structure. 
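The replacement Hash() and operator== above treat the whole structure as a flat run of bytes (Common::CityHash64 over `this` for Size() bytes, std::memcmp for equality). This is only sound because every field is a packed integer with no padding bits, which the header below enforces with std::has_unique_object_representations_v, and because both the GL-style and D3D-style register encodings are normalized by the Pack* helpers before being stored. A minimal, self-contained sketch of the same idea on a toy struct (illustrative only; it uses FNV-1a instead of Common::CityHash64 and invented field names):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Toy two-field state packed into one u32, mirroring the packing style above.
struct ToyState {
    std::uint32_t raw;

    void Fill(std::uint32_t cull_face_reg, bool cull_enable) {
        // Normalize the register encoding (0x404/0x405/0x408) to a 0-2 index, as PackCullFace does.
        const std::uint32_t packed_cull = cull_face_reg - (cull_face_reg == 0x408 ? 0x406 : 0x404);
        raw = packed_cull | (static_cast<std::uint32_t>(cull_enable) << 2);
    }

    std::size_t Hash() const noexcept {
        // FNV-1a over the object representation; the real code uses Common::CityHash64.
        std::uint64_t hash = 0xcbf29ce484222325ULL;
        const auto* bytes = reinterpret_cast<const unsigned char*>(this);
        for (std::size_t i = 0; i < sizeof *this; ++i) {
            hash = (hash ^ bytes[i]) * 0x100000001b3ULL;
        }
        return static_cast<std::size_t>(hash);
    }

    bool operator==(const ToyState& rhs) const noexcept {
        return std::memcmp(this, &rhs, sizeof *this) == 0;
    }
};
static_assert(std::has_unique_object_representations_v<ToyState>);

int main() {
    ToyState a{};
    ToyState b{};
    a.Fill(0x404, true); // Front face culling enabled
    b.Fill(0x404, true);
    assert(a == b && a.Hash() == b.Hash()); // identical states collapse to one pipeline key
    b.Fill(0x405, true); // Back face culling enabled
    assert(!(a == b));
    return 0;
}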
- struct FixedPipelineState { - using PixelFormat = VideoCore::Surface::PixelFormat; - - struct VertexBinding { - constexpr VertexBinding(u32 index, u32 stride, u32 divisor) - : index{index}, stride{stride}, divisor{divisor} {} - VertexBinding() = default; - - u32 index; - u32 stride; - u32 divisor; + static u32 PackComparisonOp(Maxwell::ComparisonOp op) noexcept; + static Maxwell::ComparisonOp UnpackComparisonOp(u32 packed) noexcept; - std::size_t Hash() const noexcept; + static u32 PackStencilOp(Maxwell::StencilOp op) noexcept; + static Maxwell::StencilOp UnpackStencilOp(u32 packed) noexcept; - bool operator==(const VertexBinding& rhs) const noexcept; + static u32 PackCullFace(Maxwell::CullFace cull) noexcept; + static Maxwell::CullFace UnpackCullFace(u32 packed) noexcept; - bool operator!=(const VertexBinding& rhs) const noexcept { - return !operator==(rhs); - } - }; + static u32 PackFrontFace(Maxwell::FrontFace face) noexcept; + static Maxwell::FrontFace UnpackFrontFace(u32 packed) noexcept; - struct VertexAttribute { - constexpr VertexAttribute(u32 index, u32 buffer, Maxwell::VertexAttribute::Type type, - Maxwell::VertexAttribute::Size size, u32 offset) - : index{index}, buffer{buffer}, type{type}, size{size}, offset{offset} {} - VertexAttribute() = default; + static u32 PackPolygonMode(Maxwell::PolygonMode mode) noexcept; + static Maxwell::PolygonMode UnpackPolygonMode(u32 packed) noexcept; - u32 index; - u32 buffer; - Maxwell::VertexAttribute::Type type; - Maxwell::VertexAttribute::Size size; - u32 offset; + static u32 PackLogicOp(Maxwell::LogicOperation op) noexcept; + static Maxwell::LogicOperation UnpackLogicOp(u32 packed) noexcept; - std::size_t Hash() const noexcept; + static u32 PackBlendEquation(Maxwell::Blend::Equation equation) noexcept; + static Maxwell::Blend::Equation UnpackBlendEquation(u32 packed) noexcept; - bool operator==(const VertexAttribute& rhs) const noexcept; + static u32 PackBlendFactor(Maxwell::Blend::Factor factor) noexcept; + static Maxwell::Blend::Factor UnpackBlendFactor(u32 packed) noexcept; - bool operator!=(const VertexAttribute& rhs) const noexcept { - return !operator==(rhs); + struct BlendingAttachment { + union { + u32 raw; + BitField<0, 1, u32> mask_r; + BitField<1, 1, u32> mask_g; + BitField<2, 1, u32> mask_b; + BitField<3, 1, u32> mask_a; + BitField<4, 3, u32> equation_rgb; + BitField<7, 3, u32> equation_a; + BitField<10, 5, u32> factor_source_rgb; + BitField<15, 5, u32> factor_dest_rgb; + BitField<20, 5, u32> factor_source_a; + BitField<25, 5, u32> factor_dest_a; + BitField<30, 1, u32> enable; + }; + + void Fill(const Maxwell& regs, std::size_t index); + + constexpr std::array<bool, 4> Mask() const noexcept { + return {mask_r != 0, mask_g != 0, mask_b != 0, mask_a != 0}; } - }; - - struct StencilFace { - constexpr StencilFace(Maxwell::StencilOp action_stencil_fail, - Maxwell::StencilOp action_depth_fail, - Maxwell::StencilOp action_depth_pass, Maxwell::ComparisonOp test_func) - : action_stencil_fail{action_stencil_fail}, action_depth_fail{action_depth_fail}, - action_depth_pass{action_depth_pass}, test_func{test_func} {} - StencilFace() = default; - - Maxwell::StencilOp action_stencil_fail; - Maxwell::StencilOp action_depth_fail; - Maxwell::StencilOp action_depth_pass; - Maxwell::ComparisonOp test_func; - std::size_t Hash() const noexcept; - - bool operator==(const StencilFace& rhs) const noexcept; - - bool operator!=(const StencilFace& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Equation EquationRGB() const noexcept { + 
return UnpackBlendEquation(equation_rgb.Value()); } - }; - struct BlendingAttachment { - constexpr BlendingAttachment(bool enable, Maxwell::Blend::Equation rgb_equation, - Maxwell::Blend::Factor src_rgb_func, - Maxwell::Blend::Factor dst_rgb_func, - Maxwell::Blend::Equation a_equation, - Maxwell::Blend::Factor src_a_func, - Maxwell::Blend::Factor dst_a_func, - std::array<bool, 4> components) - : enable{enable}, rgb_equation{rgb_equation}, src_rgb_func{src_rgb_func}, - dst_rgb_func{dst_rgb_func}, a_equation{a_equation}, src_a_func{src_a_func}, - dst_a_func{dst_a_func}, components{components} {} - BlendingAttachment() = default; - - bool enable; - Maxwell::Blend::Equation rgb_equation; - Maxwell::Blend::Factor src_rgb_func; - Maxwell::Blend::Factor dst_rgb_func; - Maxwell::Blend::Equation a_equation; - Maxwell::Blend::Factor src_a_func; - Maxwell::Blend::Factor dst_a_func; - std::array<bool, 4> components; - - std::size_t Hash() const noexcept; - - bool operator==(const BlendingAttachment& rhs) const noexcept; - - bool operator!=(const BlendingAttachment& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Equation EquationAlpha() const noexcept { + return UnpackBlendEquation(equation_a.Value()); } - }; - struct VertexInput { - std::size_t num_bindings = 0; - std::size_t num_attributes = 0; - std::array<VertexBinding, Maxwell::NumVertexArrays> bindings; - std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; + Maxwell::Blend::Factor SourceRGBFactor() const noexcept { + return UnpackBlendFactor(factor_source_rgb.Value()); + } - std::size_t Hash() const noexcept; + Maxwell::Blend::Factor DestRGBFactor() const noexcept { + return UnpackBlendFactor(factor_dest_rgb.Value()); + } - bool operator==(const VertexInput& rhs) const noexcept; + Maxwell::Blend::Factor SourceAlphaFactor() const noexcept { + return UnpackBlendFactor(factor_source_a.Value()); + } - bool operator!=(const VertexInput& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Factor DestAlphaFactor() const noexcept { + return UnpackBlendFactor(factor_dest_a.Value()); } }; - struct InputAssembly { - constexpr InputAssembly(Maxwell::PrimitiveTopology topology, bool primitive_restart_enable, - float point_size) - : topology{topology}, primitive_restart_enable{primitive_restart_enable}, - point_size{point_size} {} - InputAssembly() = default; - - Maxwell::PrimitiveTopology topology; - bool primitive_restart_enable; - float point_size; + union VertexAttribute { + u32 raw; + BitField<0, 1, u32> enabled; + BitField<1, 5, u32> buffer; + BitField<6, 14, u32> offset; + BitField<20, 3, u32> type; + BitField<23, 6, u32> size; - std::size_t Hash() const noexcept; - - bool operator==(const InputAssembly& rhs) const noexcept; + constexpr Maxwell::VertexAttribute::Type Type() const noexcept { + return static_cast<Maxwell::VertexAttribute::Type>(type.Value()); + } - bool operator!=(const InputAssembly& rhs) const noexcept { - return !operator==(rhs); + constexpr Maxwell::VertexAttribute::Size Size() const noexcept { + return static_cast<Maxwell::VertexAttribute::Size>(size.Value()); } }; - struct Tessellation { - constexpr Tessellation(u32 patch_control_points, Maxwell::TessellationPrimitive primitive, - Maxwell::TessellationSpacing spacing, bool clockwise) - : patch_control_points{patch_control_points}, primitive{primitive}, spacing{spacing}, - clockwise{clockwise} {} - Tessellation() = default; + template <std::size_t Position> + union StencilFace { + BitField<Position + 0, 3, u32> action_stencil_fail; 
+ BitField<Position + 3, 3, u32> action_depth_fail; + BitField<Position + 6, 3, u32> action_depth_pass; + BitField<Position + 9, 3, u32> test_func; - u32 patch_control_points; - Maxwell::TessellationPrimitive primitive; - Maxwell::TessellationSpacing spacing; - bool clockwise; + Maxwell::StencilOp ActionStencilFail() const noexcept { + return UnpackStencilOp(action_stencil_fail); + } - std::size_t Hash() const noexcept; + Maxwell::StencilOp ActionDepthFail() const noexcept { + return UnpackStencilOp(action_depth_fail); + } - bool operator==(const Tessellation& rhs) const noexcept; + Maxwell::StencilOp ActionDepthPass() const noexcept { + return UnpackStencilOp(action_depth_pass); + } - bool operator!=(const Tessellation& rhs) const noexcept { - return !operator==(rhs); + Maxwell::ComparisonOp TestFunc() const noexcept { + return UnpackComparisonOp(test_func); } }; - struct Rasterizer { - constexpr Rasterizer(bool cull_enable, bool depth_bias_enable, bool depth_clamp_enable, - bool ndc_minus_one_to_one, Maxwell::CullFace cull_face, - Maxwell::FrontFace front_face) - : cull_enable{cull_enable}, depth_bias_enable{depth_bias_enable}, - depth_clamp_enable{depth_clamp_enable}, ndc_minus_one_to_one{ndc_minus_one_to_one}, - cull_face{cull_face}, front_face{front_face} {} - Rasterizer() = default; - - bool cull_enable; - bool depth_bias_enable; - bool depth_clamp_enable; - bool ndc_minus_one_to_one; - Maxwell::CullFace cull_face; - Maxwell::FrontFace front_face; - - std::size_t Hash() const noexcept; + union VertexBinding { + u16 raw; + BitField<0, 12, u16> stride; + BitField<12, 1, u16> enabled; + }; - bool operator==(const Rasterizer& rhs) const noexcept; + struct DynamicState { + union { + u32 raw1; + StencilFace<0> front; + StencilFace<12> back; + BitField<24, 1, u32> stencil_enable; + BitField<25, 1, u32> depth_write_enable; + BitField<26, 1, u32> depth_bounds_enable; + BitField<27, 1, u32> depth_test_enable; + BitField<28, 1, u32> front_face; + BitField<29, 3, u32> depth_test_func; + }; + union { + u32 raw2; + BitField<0, 2, u32> cull_face; + BitField<2, 1, u32> cull_enable; + }; + std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings; + + void Fill(const Maxwell& regs); + + Maxwell::ComparisonOp DepthTestFunc() const noexcept { + return UnpackComparisonOp(depth_test_func); + } - bool operator!=(const Rasterizer& rhs) const noexcept { - return !operator==(rhs); + Maxwell::CullFace CullFace() const noexcept { + return UnpackCullFace(cull_face.Value()); } - }; - struct DepthStencil { - constexpr DepthStencil(bool depth_test_enable, bool depth_write_enable, - bool depth_bounds_enable, bool stencil_enable, - Maxwell::ComparisonOp depth_test_function, StencilFace front_stencil, - StencilFace back_stencil) - : depth_test_enable{depth_test_enable}, depth_write_enable{depth_write_enable}, - depth_bounds_enable{depth_bounds_enable}, stencil_enable{stencil_enable}, - depth_test_function{depth_test_function}, front_stencil{front_stencil}, - back_stencil{back_stencil} {} - DepthStencil() = default; - - bool depth_test_enable; - bool depth_write_enable; - bool depth_bounds_enable; - bool stencil_enable; - Maxwell::ComparisonOp depth_test_function; - StencilFace front_stencil; - StencilFace back_stencil; - - std::size_t Hash() const noexcept; - - bool operator==(const DepthStencil& rhs) const noexcept; - - bool operator!=(const DepthStencil& rhs) const noexcept { - return !operator==(rhs); + Maxwell::FrontFace FrontFace() const noexcept { + return UnpackFrontFace(front_face.Value()); } }; - 
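DynamicState packs both stencil faces and the depth/stencil enables into a single 32-bit word: StencilFace<0> occupies bits 0-11, StencilFace<12> occupies bits 12-23, and the enable, front-face, and depth-test-function fields fill bits 24-31. A small sketch of the equivalent manual shift-and-mask layout, useful for sanity-checking the bit positions (illustrative only; PackRaw1 is an invented helper and the real code goes through Common::BitField):

#include <cassert>
#include <cstdint>

// Manual layout of DynamicState::raw1, with bit positions copied from the BitField declarations above.
constexpr std::uint32_t PackRaw1(std::uint32_t front12, std::uint32_t back12, bool stencil_enable,
                                 bool depth_write, bool depth_bounds, bool depth_test,
                                 std::uint32_t front_face, std::uint32_t depth_test_func) {
    return (front12 & 0xfff) | ((back12 & 0xfff) << 12) |
           (static_cast<std::uint32_t>(stencil_enable) << 24) |
           (static_cast<std::uint32_t>(depth_write) << 25) |
           (static_cast<std::uint32_t>(depth_bounds) << 26) |
           (static_cast<std::uint32_t>(depth_test) << 27) |
           ((front_face & 1) << 28) | ((depth_test_func & 7) << 29);
}

int main() {
    // A packed comparison op of 7 (what PackComparisonOp above produces for Always) lands in the
    // top field and must not spill outside bits 29-31.
    constexpr std::uint32_t raw1 = PackRaw1(0, 0, true, true, false, true, 0, 7);
    static_assert((raw1 >> 29) == 7);
    assert((raw1 & (1u << 24)) != 0); // stencil_enable landed on bit 24
    return 0;
}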
struct ColorBlending { - constexpr ColorBlending( - std::array<float, 4> blend_constants, std::size_t attachments_count, - std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments) - : attachments_count{attachments_count}, attachments{attachments} {} - ColorBlending() = default; - - std::size_t attachments_count; - std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments; - - std::size_t Hash() const noexcept; - - bool operator==(const ColorBlending& rhs) const noexcept; - - bool operator!=(const ColorBlending& rhs) const noexcept { - return !operator==(rhs); - } + union { + u32 raw; + BitField<0, 1, u32> no_extended_dynamic_state; + BitField<2, 1, u32> primitive_restart_enable; + BitField<3, 1, u32> depth_bias_enable; + BitField<4, 1, u32> depth_clamp_disabled; + BitField<5, 1, u32> ndc_minus_one_to_one; + BitField<6, 2, u32> polygon_mode; + BitField<8, 5, u32> patch_control_points_minus_one; + BitField<13, 2, u32> tessellation_primitive; + BitField<15, 2, u32> tessellation_spacing; + BitField<17, 1, u32> tessellation_clockwise; + BitField<18, 1, u32> logic_op_enable; + BitField<19, 4, u32> logic_op; + BitField<23, 1, u32> rasterize_enable; + BitField<24, 4, Maxwell::PrimitiveTopology> topology; }; + u32 point_size; + std::array<u32, Maxwell::NumVertexArrays> binding_divisors; + std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; + std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments; + std::array<u16, Maxwell::NumViewports> viewport_swizzles; + DynamicState dynamic_state; + + void Fill(const Maxwell& regs, bool has_extended_dynamic_state); std::size_t Hash() const noexcept; @@ -249,26 +204,14 @@ struct FixedPipelineState { return !operator==(rhs); } - VertexInput vertex_input; - InputAssembly input_assembly; - Tessellation tessellation; - Rasterizer rasterizer; - DepthStencil depth_stencil; - ColorBlending color_blending; + std::size_t Size() const noexcept { + const std::size_t total_size = sizeof *this; + return total_size - (no_extended_dynamic_state != 0 ? 
0 : sizeof(DynamicState)); + } }; -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexBinding>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexAttribute>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::StencilFace>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::BlendingAttachment>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexInput>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::InputAssembly>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::Tessellation>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::Rasterizer>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::DepthStencil>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::ColorBlending>); +static_assert(std::has_unique_object_representations_v<FixedPipelineState>); static_assert(std::is_trivially_copyable_v<FixedPipelineState>); - -FixedPipelineState GetFixedPipelineState(const Maxwell& regs); +static_assert(std::is_trivially_constructible_v<FixedPipelineState>); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 8681b821f..d22de1d81 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -21,29 +21,29 @@ namespace Sampler { VkFilter Filter(Tegra::Texture::TextureFilter filter) { switch (filter) { - case Tegra::Texture::TextureFilter::Linear: - return VK_FILTER_LINEAR; case Tegra::Texture::TextureFilter::Nearest: return VK_FILTER_NEAREST; + case Tegra::Texture::TextureFilter::Linear: + return VK_FILTER_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter)); return {}; } VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) { switch (mipmap_filter) { case Tegra::Texture::TextureMipmapFilter::None: - // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping - // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to - // use an image view with a single mipmap level to emulate this. - return VK_SAMPLER_MIPMAP_MODE_LINEAR; - ; - case Tegra::Texture::TextureMipmapFilter::Linear: - return VK_SAMPLER_MIPMAP_MODE_LINEAR; + // There are no Vulkan filter modes that directly correspond to OpenGL minification filters + // of GL_LINEAR or GL_NEAREST, but they can be emulated using + // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter = + // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively. 
+ return VK_SAMPLER_MIPMAP_MODE_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: return VK_SAMPLER_MIPMAP_MODE_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); return {}; } @@ -118,89 +118,101 @@ struct FormatTuple { VkFormat format; ///< Vulkan format int usage = 0; ///< Describes image format usage } constexpr tex_format_tuples[] = { - {VK_FORMAT_A8B8G8R8_UNORM_PACK32, Attachable | Storage}, // ABGR8U - {VK_FORMAT_A8B8G8R8_SNORM_PACK32, Attachable | Storage}, // ABGR8S - {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // ABGR8UI - {VK_FORMAT_B5G6R5_UNORM_PACK16}, // B5G6R5U - {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10U - {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5U (flipped with swizzle) - {VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8U - {VK_FORMAT_R8_UINT, Attachable | Storage}, // R8UI - {VK_FORMAT_R16G16B16A16_SFLOAT, Attachable | Storage}, // RGBA16F - {VK_FORMAT_R16G16B16A16_UNORM, Attachable | Storage}, // RGBA16U - {VK_FORMAT_R16G16B16A16_SNORM, Attachable | Storage}, // RGBA16S - {VK_FORMAT_R16G16B16A16_UINT, Attachable | Storage}, // RGBA16UI - {VK_FORMAT_B10G11R11_UFLOAT_PACK32, Attachable | Storage}, // R11FG11FB10F - {VK_FORMAT_R32G32B32A32_UINT, Attachable | Storage}, // RGBA32UI - {VK_FORMAT_BC1_RGBA_UNORM_BLOCK}, // DXT1 - {VK_FORMAT_BC2_UNORM_BLOCK}, // DXT23 - {VK_FORMAT_BC3_UNORM_BLOCK}, // DXT45 - {VK_FORMAT_BC4_UNORM_BLOCK}, // DXN1 - {VK_FORMAT_BC5_UNORM_BLOCK}, // DXN2UNORM - {VK_FORMAT_BC5_SNORM_BLOCK}, // DXN2SNORM - {VK_FORMAT_BC7_UNORM_BLOCK}, // BC7U - {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16 - {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16 - {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4 - {VK_FORMAT_B8G8R8A8_UNORM}, // BGRA8 - {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F - {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F - {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F - {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F - {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U - {VK_FORMAT_UNDEFINED}, // R16S - {VK_FORMAT_UNDEFINED}, // R16UI - {VK_FORMAT_UNDEFINED}, // R16I - {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16 - {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F - {VK_FORMAT_UNDEFINED}, // RG16UI - {VK_FORMAT_UNDEFINED}, // RG16I - {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // RG16S - {VK_FORMAT_UNDEFINED}, // RGB32F - {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // RGBA8_SRGB - {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // RG8U - {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // RG8S - {VK_FORMAT_R32G32_UINT, Attachable | Storage}, // RG32UI - {VK_FORMAT_UNDEFINED}, // RGBX16F - {VK_FORMAT_R32_UINT, Attachable | Storage}, // R32UI - {VK_FORMAT_R32_SINT, Attachable | Storage}, // R32I - {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8 - {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5 - {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4 - {VK_FORMAT_UNDEFINED}, // BGRA8_SRGB - {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB - {VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB - {VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB - {VK_FORMAT_BC7_SRGB_BLOCK}, // BC7U_SRGB - {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable}, // R4G4B4A4U - {VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB - {VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // 
ASTC_2D_8X8_SRGB - {VK_FORMAT_ASTC_8x5_SRGB_BLOCK}, // ASTC_2D_8X5_SRGB - {VK_FORMAT_ASTC_5x4_SRGB_BLOCK}, // ASTC_2D_5X4_SRGB - {VK_FORMAT_ASTC_5x5_UNORM_BLOCK}, // ASTC_2D_5X5 - {VK_FORMAT_ASTC_5x5_SRGB_BLOCK}, // ASTC_2D_5X5_SRGB - {VK_FORMAT_ASTC_10x8_UNORM_BLOCK}, // ASTC_2D_10X8 - {VK_FORMAT_ASTC_10x8_SRGB_BLOCK}, // ASTC_2D_10X8_SRGB - {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6 - {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB - {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10 - {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB - {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12 - {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB - {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6 - {VK_FORMAT_ASTC_8x6_SRGB_BLOCK}, // ASTC_2D_8X6_SRGB - {VK_FORMAT_ASTC_6x5_UNORM_BLOCK}, // ASTC_2D_6X5 - {VK_FORMAT_ASTC_6x5_SRGB_BLOCK}, // ASTC_2D_6X5_SRGB - {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}, // E5B9G9R9F + {VK_FORMAT_A8B8G8R8_UNORM_PACK32, Attachable | Storage}, // A8B8G8R8_UNORM + {VK_FORMAT_A8B8G8R8_SNORM_PACK32, Attachable | Storage}, // A8B8G8R8_SNORM + {VK_FORMAT_A8B8G8R8_SINT_PACK32, Attachable | Storage}, // A8B8G8R8_SINT + {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // A8B8G8R8_UINT + {VK_FORMAT_R5G6B5_UNORM_PACK16, Attachable}, // R5G6B5_UNORM + {VK_FORMAT_B5G6R5_UNORM_PACK16, Attachable}, // B5G6R5_UNORM + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM + {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM + {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5_UNORM (flipped with swizzle) + {VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8_UNORM + {VK_FORMAT_R8_SNORM, Attachable | Storage}, // R8_SNORM + {VK_FORMAT_R8_SINT, Attachable | Storage}, // R8_SINT + {VK_FORMAT_R8_UINT, Attachable | Storage}, // R8_UINT + {VK_FORMAT_R16G16B16A16_SFLOAT, Attachable | Storage}, // R16G16B16A16_FLOAT + {VK_FORMAT_R16G16B16A16_UNORM, Attachable | Storage}, // R16G16B16A16_UNORM + {VK_FORMAT_R16G16B16A16_SNORM, Attachable | Storage}, // R16G16B16A16_SNORM + {VK_FORMAT_R16G16B16A16_SINT, Attachable | Storage}, // R16G16B16A16_SINT + {VK_FORMAT_R16G16B16A16_UINT, Attachable | Storage}, // R16G16B16A16_UINT + {VK_FORMAT_B10G11R11_UFLOAT_PACK32, Attachable | Storage}, // B10G11R11_FLOAT + {VK_FORMAT_R32G32B32A32_UINT, Attachable | Storage}, // R32G32B32A32_UINT + {VK_FORMAT_BC1_RGBA_UNORM_BLOCK}, // BC1_RGBA_UNORM + {VK_FORMAT_BC2_UNORM_BLOCK}, // BC2_UNORM + {VK_FORMAT_BC3_UNORM_BLOCK}, // BC3_UNORM + {VK_FORMAT_BC4_UNORM_BLOCK}, // BC4_UNORM + {VK_FORMAT_BC4_SNORM_BLOCK}, // BC4_SNORM + {VK_FORMAT_BC5_UNORM_BLOCK}, // BC5_UNORM + {VK_FORMAT_BC5_SNORM_BLOCK}, // BC5_SNORM + {VK_FORMAT_BC7_UNORM_BLOCK}, // BC7_UNORM + {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UFLOAT + {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SFLOAT + {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4_UNORM + {VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // B8G8R8A8_UNORM + {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // R32G32B32A32_FLOAT + {VK_FORMAT_R32G32B32A32_SINT, Attachable | Storage}, // R32G32B32A32_SINT + {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // R32G32_FLOAT + {VK_FORMAT_R32G32_SINT, Attachable | Storage}, // R32G32_SINT + {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32_FLOAT + {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16_FLOAT + {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16_UNORM + {VK_FORMAT_UNDEFINED}, // R16_SNORM + 
{VK_FORMAT_R16_UINT, Attachable | Storage}, // R16_UINT + {VK_FORMAT_UNDEFINED}, // R16_SINT + {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // R16G16_UNORM + {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // R16G16_FLOAT + {VK_FORMAT_UNDEFINED}, // R16G16_UINT + {VK_FORMAT_UNDEFINED}, // R16G16_SINT + {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM + {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT + {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB + {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM + {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM + {VK_FORMAT_R8G8_SINT, Attachable | Storage}, // R8G8_SINT + {VK_FORMAT_R8G8_UINT, Attachable | Storage}, // R8G8_UINT + {VK_FORMAT_R32G32_UINT, Attachable | Storage}, // R32G32_UINT + {VK_FORMAT_UNDEFINED}, // R16G16B16X16_FLOAT + {VK_FORMAT_R32_UINT, Attachable | Storage}, // R32_UINT + {VK_FORMAT_R32_SINT, Attachable | Storage}, // R32_SINT + {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8_UNORM + {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5_UNORM + {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4_UNORM + {VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // B8G8R8A8_SRGB + {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // BC1_RGBA_SRGB + {VK_FORMAT_BC2_SRGB_BLOCK}, // BC2_SRGB + {VK_FORMAT_BC3_SRGB_BLOCK}, // BC3_SRGB + {VK_FORMAT_BC7_SRGB_BLOCK}, // BC7_SRGB + {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable}, // A4B4G4R4_UNORM + {VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB + {VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // ASTC_2D_8X8_SRGB + {VK_FORMAT_ASTC_8x5_SRGB_BLOCK}, // ASTC_2D_8X5_SRGB + {VK_FORMAT_ASTC_5x4_SRGB_BLOCK}, // ASTC_2D_5X4_SRGB + {VK_FORMAT_ASTC_5x5_UNORM_BLOCK}, // ASTC_2D_5X5_UNORM + {VK_FORMAT_ASTC_5x5_SRGB_BLOCK}, // ASTC_2D_5X5_SRGB + {VK_FORMAT_ASTC_10x8_UNORM_BLOCK}, // ASTC_2D_10X8_UNORM + {VK_FORMAT_ASTC_10x8_SRGB_BLOCK}, // ASTC_2D_10X8_SRGB + {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6_UNORM + {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB + {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10_UNORM + {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB + {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12_UNORM + {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB + {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6_UNORM + {VK_FORMAT_ASTC_8x6_SRGB_BLOCK}, // ASTC_2D_8X6_SRGB + {VK_FORMAT_ASTC_6x5_UNORM_BLOCK}, // ASTC_2D_6X5_UNORM + {VK_FORMAT_ASTC_6x5_SRGB_BLOCK}, // ASTC_2D_6X5_SRGB + {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}, // E5B9G9R9_FLOAT // Depth formats - {VK_FORMAT_D32_SFLOAT, Attachable}, // Z32F - {VK_FORMAT_D16_UNORM, Attachable}, // Z16 + {VK_FORMAT_D32_SFLOAT, Attachable}, // D32_FLOAT + {VK_FORMAT_D16_UNORM, Attachable}, // D16_UNORM // DepthStencil formats - {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // Z24S8 - {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // S8Z24 (emulated) - {VK_FORMAT_D32_SFLOAT_S8_UINT, Attachable}, // Z32FS8 + {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // D24_UNORM_S8_UINT + {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // S8_UINT_D24_UNORM (emulated) + {VK_FORMAT_D32_SFLOAT_S8_UINT, Attachable}, // D32_FLOAT_S8_UINT }; static_assert(std::size(tex_format_tuples) == VideoCore::Surface::MaxPixelFormat); @@ -221,7 +233,7 @@ FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFo return {VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; } - // Use ABGR8 on hardware that doesn't support ASTC natively + // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && 
VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { tuple.format = VideoCore::Surface::IsPixelFormatSRGB(pixel_format) ? VK_FORMAT_A8B8G8R8_SRGB_PACK32 @@ -295,6 +307,30 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { switch (type) { + case Maxwell::VertexAttribute::Type::UnsignedNorm: + switch (size) { + case Maxwell::VertexAttribute::Size::Size_8: + return VK_FORMAT_R8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8: + return VK_FORMAT_R8G8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8: + return VK_FORMAT_R8G8B8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return VK_FORMAT_R8G8B8A8_UNORM; + case Maxwell::VertexAttribute::Size::Size_16: + return VK_FORMAT_R16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16: + return VK_FORMAT_R16G16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return VK_FORMAT_R16G16B16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return VK_FORMAT_R16G16B16A16_UNORM; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + default: + break; + } + break; case Maxwell::VertexAttribute::Type::SignedNorm: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: @@ -319,44 +355,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib break; } break; - case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_UNORM; + return VK_FORMAT_R8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_UNORM; + return VK_FORMAT_R8G8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_UNORM; + return VK_FORMAT_R8G8B8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_UNORM; + return VK_FORMAT_R8G8B8A8_USCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_UNORM; + return VK_FORMAT_R16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_UNORM; + return VK_FORMAT_R16G16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_UNORM; + return VK_FORMAT_R16G16B16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_UNORM; + return VK_FORMAT_R16G16B16A16_USCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: - return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + return VK_FORMAT_A2B10G10R10_USCALED_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::SignedInt: + case Maxwell::VertexAttribute::Type::SignedScaled: switch (size) { - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SINT; case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SINT; + return VK_FORMAT_R8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SINT; + return VK_FORMAT_R8G8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SINT; + return VK_FORMAT_R8G8B8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SINT; - case Maxwell::VertexAttribute::Size::Size_32: - return VK_FORMAT_R32_SINT; + return VK_FORMAT_R8G8B8A8_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16: + return VK_FORMAT_R16_SSCALED; + case 
Maxwell::VertexAttribute::Size::Size_16_16: + return VK_FORMAT_R16G16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return VK_FORMAT_R16G16B16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return VK_FORMAT_R16G16B16A16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SSCALED_PACK32; default: break; } @@ -387,56 +429,54 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_UINT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_UINT; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UINT_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_USCALED; + return VK_FORMAT_R8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_USCALED; + return VK_FORMAT_R8G8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_USCALED; + return VK_FORMAT_R8G8B8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_USCALED; + return VK_FORMAT_R8G8B8A8_SINT; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_USCALED; + return VK_FORMAT_R16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_USCALED; + return VK_FORMAT_R16G16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_USCALED; + return VK_FORMAT_R16G16B16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_USCALED; + return VK_FORMAT_R16G16B16A16_SINT; + case Maxwell::VertexAttribute::Size::Size_32: + return VK_FORMAT_R32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32: + return VK_FORMAT_R32G32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return VK_FORMAT_R32G32B32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return VK_FORMAT_R32G32B32A32_SINT; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SINT_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::Float: switch (size) { - case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SSCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SSCALED; + return VK_FORMAT_R16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SSCALED; + return VK_FORMAT_R16G16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SSCALED; + return VK_FORMAT_R16G16B16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SSCALED; - default: - break; - } - break; - case Maxwell::VertexAttribute::Type::Float: - switch (size) { + return VK_FORMAT_R16G16B16A16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32: return VK_FORMAT_R32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32: @@ -445,14 +485,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return 
VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SFLOAT; default: break; } @@ -672,4 +704,27 @@ VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle) { return {}; } +VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { + switch (swizzle) { + case Maxwell::ViewportSwizzle::PositiveX: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV; + case Maxwell::ViewportSwizzle::NegativeX: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_X_NV; + case Maxwell::ViewportSwizzle::PositiveY: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Y_NV; + case Maxwell::ViewportSwizzle::NegativeY: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Y_NV; + case Maxwell::ViewportSwizzle::PositiveZ: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Z_NV; + case Maxwell::ViewportSwizzle::NegativeZ: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Z_NV; + case Maxwell::ViewportSwizzle::PositiveW: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_W_NV; + case Maxwell::ViewportSwizzle::NegativeW: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV; + } + UNREACHABLE_MSG("Invalid swizzle={}", static_cast<int>(swizzle)); + return {}; +} + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 81bce4c6c..7e213452f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -59,4 +59,6 @@ VkCullModeFlags CullFace(Maxwell::CullFace cull_face); VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); +VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle); + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp new file mode 100644 index 000000000..5b01020ec --- /dev/null +++ b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp @@ -0,0 +1,220 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#ifdef HAS_NSIGHT_AFTERMATH + +#include <mutex> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include <fmt/format.h> + +#define VK_NO_PROTOTYPES +#include <vulkan/vulkan.h> + +#include <GFSDK_Aftermath.h> +#include <GFSDK_Aftermath_Defines.h> +#include <GFSDK_Aftermath_GpuCrashDump.h> +#include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> + +#include "common/common_paths.h" +#include "common/common_types.h" +#include "common/file_util.h" +#include "common/logging/log.h" +#include "common/scope_exit.h" + +#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" + +namespace Vulkan { + +static constexpr char AFTERMATH_LIB_NAME[] = "GFSDK_Aftermath_Lib.x64.dll"; + +NsightAftermathTracker::NsightAftermathTracker() = default; + +NsightAftermathTracker::~NsightAftermathTracker() { + if (initialized) { + (void)GFSDK_Aftermath_DisableGpuCrashDumps(); + } +} + +bool NsightAftermathTracker::Initialize() { + if (!dl.Open(AFTERMATH_LIB_NAME)) { + LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath DLL"); + return false; + } + + if (!dl.GetSymbol("GFSDK_Aftermath_DisableGpuCrashDumps", + &GFSDK_Aftermath_DisableGpuCrashDumps) || + !dl.GetSymbol("GFSDK_Aftermath_EnableGpuCrashDumps", + &GFSDK_Aftermath_EnableGpuCrashDumps) || + !dl.GetSymbol("GFSDK_Aftermath_GetShaderDebugInfoIdentifier", + &GFSDK_Aftermath_GetShaderDebugInfoIdentifier) || + !dl.GetSymbol("GFSDK_Aftermath_GetShaderHashSpirv", &GFSDK_Aftermath_GetShaderHashSpirv) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_CreateDecoder", + &GFSDK_Aftermath_GpuCrashDump_CreateDecoder) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_DestroyDecoder", + &GFSDK_Aftermath_GpuCrashDump_DestroyDecoder) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_GenerateJSON", + &GFSDK_Aftermath_GpuCrashDump_GenerateJSON) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_GetJSON", + &GFSDK_Aftermath_GpuCrashDump_GetJSON)) { + LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath function pointers"); + return false; + } + + dump_dir = Common::FS::GetUserPath(Common::FS::UserPath::LogDir) + "gpucrash"; + + (void)Common::FS::DeleteDirRecursively(dump_dir); + if (!Common::FS::CreateDir(dump_dir)) { + LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory"); + return false; + } + + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_EnableGpuCrashDumps( + GFSDK_Aftermath_Version_API, GFSDK_Aftermath_GpuCrashDumpWatchedApiFlags_Vulkan, + GFSDK_Aftermath_GpuCrashDumpFeatureFlags_Default, GpuCrashDumpCallback, + ShaderDebugInfoCallback, CrashDumpDescriptionCallback, this))) { + LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_EnableGpuCrashDumps failed"); + return false; + } + + LOG_INFO(Render_Vulkan, "Nsight Aftermath dump directory is \"{}\"", dump_dir); + + initialized = true; + return true; +} + +void NsightAftermathTracker::SaveShader(const std::vector<u32>& spirv) const { + if (!initialized) { + return; + } + + std::vector<u32> spirv_copy = spirv; + GFSDK_Aftermath_SpirvCode shader; + shader.pData = spirv_copy.data(); + shader.size = static_cast<u32>(spirv_copy.size() * 4); + + std::scoped_lock lock{mutex}; + + GFSDK_Aftermath_ShaderHash hash; + if (!GFSDK_Aftermath_SUCCEED( + GFSDK_Aftermath_GetShaderHashSpirv(GFSDK_Aftermath_Version_API, &shader, &hash))) { + LOG_ERROR(Render_Vulkan, "Failed to hash SPIR-V module"); + return; + } + + Common::FS::IOFile file(fmt::format("{}/source_{:016x}.spv", dump_dir, hash.hash), "wb"); + if (!file.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Failed to dump SPIR-V module 
with hash={:016x}", hash.hash); + return; + } + if (file.WriteArray(spirv.data(), spirv.size()) != spirv.size()) { + LOG_ERROR(Render_Vulkan, "Failed to write SPIR-V module with hash={:016x}", hash.hash); + return; + } +} + +void NsightAftermathTracker::OnGpuCrashDumpCallback(const void* gpu_crash_dump, + u32 gpu_crash_dump_size) { + std::scoped_lock lock{mutex}; + + LOG_CRITICAL(Render_Vulkan, "called"); + + GFSDK_Aftermath_GpuCrashDump_Decoder decoder; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GpuCrashDump_CreateDecoder( + GFSDK_Aftermath_Version_API, gpu_crash_dump, gpu_crash_dump_size, &decoder))) { + LOG_ERROR(Render_Vulkan, "Failed to create decoder"); + return; + } + SCOPE_EXIT({ GFSDK_Aftermath_GpuCrashDump_DestroyDecoder(decoder); }); + + u32 json_size = 0; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GpuCrashDump_GenerateJSON( + decoder, GFSDK_Aftermath_GpuCrashDumpDecoderFlags_ALL_INFO, + GFSDK_Aftermath_GpuCrashDumpFormatterFlags_NONE, nullptr, nullptr, nullptr, nullptr, + this, &json_size))) { + LOG_ERROR(Render_Vulkan, "Failed to generate JSON"); + return; + } + std::vector<char> json(json_size); + if (!GFSDK_Aftermath_SUCCEED( + GFSDK_Aftermath_GpuCrashDump_GetJSON(decoder, json_size, json.data()))) { + LOG_ERROR(Render_Vulkan, "Failed to query JSON"); + return; + } + + const std::string base_name = [this] { + const int id = dump_id++; + if (id == 0) { + return fmt::format("{}/crash.nv-gpudmp", dump_dir); + } else { + return fmt::format("{}/crash_{}.nv-gpudmp", dump_dir, id); + } + }(); + + std::string_view dump_view(static_cast<const char*>(gpu_crash_dump), gpu_crash_dump_size); + if (Common::FS::WriteStringToFile(false, base_name, dump_view) != gpu_crash_dump_size) { + LOG_ERROR(Render_Vulkan, "Failed to write dump file"); + return; + } + const std::string_view json_view(json.data(), json.size()); + if (Common::FS::WriteStringToFile(true, base_name + ".json", json_view) != json.size()) { + LOG_ERROR(Render_Vulkan, "Failed to write JSON"); + return; + } +} + +void NsightAftermathTracker::OnShaderDebugInfoCallback(const void* shader_debug_info, + u32 shader_debug_info_size) { + std::scoped_lock lock{mutex}; + + GFSDK_Aftermath_ShaderDebugInfoIdentifier identifier; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GetShaderDebugInfoIdentifier( + GFSDK_Aftermath_Version_API, shader_debug_info, shader_debug_info_size, &identifier))) { + LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_GetShaderDebugInfoIdentifier failed"); + return; + } + + const std::string path = + fmt::format("{}/shader_{:016x}{:016x}.nvdbg", dump_dir, identifier.id[0], identifier.id[1]); + Common::FS::IOFile file(path, "wb"); + if (!file.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Failed to create file {}", path); + return; + } + if (file.WriteBytes(static_cast<const u8*>(shader_debug_info), shader_debug_info_size) != + shader_debug_info_size) { + LOG_ERROR(Render_Vulkan, "Failed to write file {}", path); + return; + } +} + +void NsightAftermathTracker::OnCrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description) { + add_description(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationName, "yuzu"); +} + +void NsightAftermathTracker::GpuCrashDumpCallback(const void* gpu_crash_dump, + u32 gpu_crash_dump_size, void* user_data) { + static_cast<NsightAftermathTracker*>(user_data)->OnGpuCrashDumpCallback(gpu_crash_dump, + gpu_crash_dump_size); +} + +void NsightAftermathTracker::ShaderDebugInfoCallback(const void* shader_debug_info, + u32 shader_debug_info_size, void* user_data) { + 
static_cast<NsightAftermathTracker*>(user_data)->OnShaderDebugInfoCallback( + shader_debug_info, shader_debug_info_size); +} + +void NsightAftermathTracker::CrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description, void* user_data) { + static_cast<NsightAftermathTracker*>(user_data)->OnCrashDumpDescriptionCallback( + add_description); +} + +} // namespace Vulkan + +#endif // HAS_NSIGHT_AFTERMATH diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h new file mode 100644 index 000000000..afe7ae99e --- /dev/null +++ b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h @@ -0,0 +1,87 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <mutex> +#include <string> +#include <vector> + +#define VK_NO_PROTOTYPES +#include <vulkan/vulkan.h> + +#ifdef HAS_NSIGHT_AFTERMATH +#include <GFSDK_Aftermath_Defines.h> +#include <GFSDK_Aftermath_GpuCrashDump.h> +#include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> +#endif + +#include "common/common_types.h" +#include "common/dynamic_library.h" + +namespace Vulkan { + +class NsightAftermathTracker { +public: + NsightAftermathTracker(); + ~NsightAftermathTracker(); + + NsightAftermathTracker(const NsightAftermathTracker&) = delete; + NsightAftermathTracker& operator=(const NsightAftermathTracker&) = delete; + + // Delete move semantics because Aftermath initialization uses a pointer to this. + NsightAftermathTracker(NsightAftermathTracker&&) = delete; + NsightAftermathTracker& operator=(NsightAftermathTracker&&) = delete; + + bool Initialize(); + + void SaveShader(const std::vector<u32>& spirv) const; + +private: +#ifdef HAS_NSIGHT_AFTERMATH + static void GpuCrashDumpCallback(const void* gpu_crash_dump, u32 gpu_crash_dump_size, + void* user_data); + + static void ShaderDebugInfoCallback(const void* shader_debug_info, u32 shader_debug_info_size, + void* user_data); + + static void CrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description, void* user_data); + + void OnGpuCrashDumpCallback(const void* gpu_crash_dump, u32 gpu_crash_dump_size); + + void OnShaderDebugInfoCallback(const void* shader_debug_info, u32 shader_debug_info_size); + + void OnCrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description); + + mutable std::mutex mutex; + + std::string dump_dir; + int dump_id = 0; + + bool initialized = false; + + Common::DynamicLibrary dl; + PFN_GFSDK_Aftermath_DisableGpuCrashDumps GFSDK_Aftermath_DisableGpuCrashDumps; + PFN_GFSDK_Aftermath_EnableGpuCrashDumps GFSDK_Aftermath_EnableGpuCrashDumps; + PFN_GFSDK_Aftermath_GetShaderDebugInfoIdentifier GFSDK_Aftermath_GetShaderDebugInfoIdentifier; + PFN_GFSDK_Aftermath_GetShaderHashSpirv GFSDK_Aftermath_GetShaderHashSpirv; + PFN_GFSDK_Aftermath_GpuCrashDump_CreateDecoder GFSDK_Aftermath_GpuCrashDump_CreateDecoder; + PFN_GFSDK_Aftermath_GpuCrashDump_DestroyDecoder GFSDK_Aftermath_GpuCrashDump_DestroyDecoder; + PFN_GFSDK_Aftermath_GpuCrashDump_GenerateJSON GFSDK_Aftermath_GpuCrashDump_GenerateJSON; + PFN_GFSDK_Aftermath_GpuCrashDump_GetJSON GFSDK_Aftermath_GpuCrashDump_GetJSON; +#endif +}; + +#ifndef HAS_NSIGHT_AFTERMATH +inline NsightAftermathTracker::NsightAftermathTracker() = default; +inline NsightAftermathTracker::~NsightAftermathTracker() = default; +inline bool NsightAftermathTracker::Initialize() { + return false; +} 
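As the inline stubs here show, the tracker degrades to a no-op when HAS_NSIGHT_AFTERMATH is not defined, so call sites can use it unconditionally. A hypothetical caller (not taken from the yuzu sources) could look roughly like this, relying only on the Initialize()/SaveShader() interface declared above:

#include <vector>

#include "common/common_types.h"
#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h"

// Hypothetical helper: register a freshly built SPIR-V module with the crash tracker so it can be
// matched against a GPU crash dump later.
void RegisterModuleForCrashDumps(Vulkan::NsightAftermathTracker& tracker,
                                 const std::vector<u32>& spirv) {
    // SaveShader() does nothing unless Initialize() succeeded earlier, and it compiles to an empty
    // inline function when the Aftermath SDK is unavailable, so no #ifdef is needed here.
    tracker.SaveShader(spirv);
}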
+inline void NsightAftermathTracker::SaveShader(const std::vector<u32>&) const {} +#endif + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index dd590c38b..f2610868e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,24 +12,22 @@ #include <fmt/format.h> -#include "common/assert.h" #include "common/dynamic_library.h" +#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" #include "core/core_timing.h" #include "core/frontend/emu_window.h" -#include "core/memory.h" -#include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" #include "video_core/gpu.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -42,7 +40,7 @@ #include <vulkan/vulkan_win32.h> #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) #include <X11/Xlib.h> #include <vulkan/vulkan_wayland.h> #include <vulkan/vulkan_xlib.h> @@ -58,7 +56,7 @@ VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, VkDebugUtilsMessageTypeFlagsEXT type, const VkDebugUtilsMessengerCallbackDataEXT* data, [[maybe_unused]] void* user_data) { - const char* message{data->pMessage}; + const char* const message{data->pMessage}; if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { LOG_CRITICAL(Render_Vulkan, "{}", message); @@ -79,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() { char* libvulkan_env = getenv("LIBVULKAN_PATH"); if (!libvulkan_env || !library.Open(libvulkan_env)) { // Use the libvulkan.dylib from the application bundle. - std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + const std::string filename = + Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; library.Open(filename.c_str()); } #else @@ -87,15 +86,15 @@ Common::DynamicLibrary OpenVulkanLibrary() { if (!library.Open(filename.c_str())) { // Android devices may not have libvulkan.so.1, only libvulkan.so. 
filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); - library.Open(filename.c_str()); + (void)library.Open(filename.c_str()); } #endif return library; } -vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatch& dld, - WindowSystemType window_type = WindowSystemType::Headless, - bool enable_layers = false) { +std::pair<vk::Instance, u32> CreateInstance( + Common::DynamicLibrary& library, vk::InstanceDispatch& dld, + WindowSystemType window_type = WindowSystemType::Headless, bool enable_layers = false) { if (!library.IsOpen()) { LOG_ERROR(Render_Vulkan, "Vulkan library not available"); return {}; @@ -119,7 +118,7 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); break; #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) case Core::Frontend::WindowSystemType::X11: extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); break; @@ -156,12 +155,35 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc } } - static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"}; - vk::Span<const char*> layers = layers_data; - if (!enable_layers) { - layers = {}; + std::vector<const char*> layers; + layers.reserve(1); + if (enable_layers) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + + const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); + if (!layer_properties) { + LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); + layers.clear(); + } + + for (auto layer_it = layers.begin(); layer_it != layers.end();) { + const char* const layer = *layer_it; + const auto it = std::find_if( + layer_properties->begin(), layer_properties->end(), + [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); + if (it == layer_properties->end()) { + LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); + layer_it = layers.erase(layer_it); + } else { + ++layer_it; + } } - vk::Instance instance = vk::Instance::Create(layers, extensions, dld); + + // Limit the maximum version of Vulkan to avoid using an untested version. 
+ const u32 version = std::min(vk::AvailableVersion(dld), static_cast<u32>(VK_API_VERSION_1_1)); + + vk::Instance instance = vk::Instance::Create(version, layers, extensions, dld); if (!instance) { LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); return {}; @@ -169,7 +191,7 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc if (!vk::Load(*instance, dld)) { LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); } - return instance; + return std::make_pair(std::move(instance), version); } std::string GetReadableVersion(u32 version) { @@ -218,8 +240,12 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext } // Anonymous namespace -RendererVulkan::RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system) - : RendererBase(window), system{system} {} +RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window, std::move(context)}, telemetry_session{telemetry_session_}, + cpu_memory{cpu_memory_}, gpu{gpu_} {} RendererVulkan::~RendererVulkan() { ShutDown(); @@ -246,11 +272,11 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { scheduler->WaitWorker(); swapchain->AcquireNextImage(); - const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); + const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated); - scheduler->Flush(false, render_semaphore); + scheduler->Flush(render_semaphore); - if (swapchain->Present(render_semaphore, fence)) { + if (swapchain->Present(render_semaphore)) { blit_screen->Recreate(); } @@ -260,15 +286,10 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { render_window.PollEvents(); } -bool RendererVulkan::TryPresent(int /*timeout_ms*/) { - // TODO (bunnei): ImplementMe - return true; -} - bool RendererVulkan::Init() { library = OpenVulkanLibrary(); - instance = CreateInstance(library, dld, render_window.GetWindowInfo().type, - Settings::values.renderer_debug); + std::tie(instance, instance_version) = CreateInstance( + library, dld, render_window.GetWindowInfo().type, Settings::values.renderer_debug); if (!instance || !CreateDebugCallback() || !CreateSurface() || !PickDevices()) { return false; } @@ -277,23 +298,21 @@ bool RendererVulkan::Init() { memory_manager = std::make_unique<VKMemoryManager>(*device); - resource_manager = std::make_unique<VKResourceManager>(*device); + state_tracker = std::make_unique<StateTracker>(gpu); + + scheduler = std::make_unique<VKScheduler>(*device, *state_tracker); const auto& framebuffer = render_window.GetFramebufferLayout(); - swapchain = std::make_unique<VKSwapchain>(*surface, *device); + swapchain = std::make_unique<VKSwapchain>(*surface, *device, *scheduler); swapchain->Create(framebuffer.width, framebuffer.height, false); - state_tracker = std::make_unique<StateTracker>(system); - - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); - - rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, - *state_tracker, *scheduler); + rasterizer = std::make_unique<RasterizerVulkan>(render_window, gpu, gpu.MemoryManager(), + cpu_memory, screen_info, *device, + *memory_manager, *state_tracker, *scheduler); - 
blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, - *resource_manager, *memory_manager, *swapchain, - *scheduler, screen_info); + blit_screen = + std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device, + *memory_manager, *swapchain, *scheduler, screen_info); return true; } @@ -311,7 +330,6 @@ void RendererVulkan::ShutDown() { scheduler.reset(); swapchain.reset(); memory_manager.reset(); - resource_manager.reset(); device.reset(); } @@ -345,7 +363,7 @@ bool RendererVulkan::CreateSurface() { } } #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) if (window_info.type == Core::Frontend::WindowSystemType::X11) { const VkXlibSurfaceCreateInfoKHR xlib_ci{ VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, @@ -390,7 +408,7 @@ bool RendererVulkan::PickDevices() { return false; } - const s32 device_index = Settings::values.vulkan_device; + const s32 device_index = Settings::values.vulkan_device.GetValue(); if (device_index < 0 || device_index >= static_cast<s32>(devices->size())) { LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); return false; @@ -401,7 +419,8 @@ bool RendererVulkan::PickDevices() { return false; } - device = std::make_unique<VKDevice>(*instance, physical_device, *surface, dld); + device = + std::make_unique<VKDevice>(*instance, instance_version, physical_device, *surface, dld); return device->Create(); } @@ -411,7 +430,7 @@ void RendererVulkan::Report() const { const std::string driver_version = GetDriverVersion(*device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->GetApiVersion()); + const std::string api_version = GetReadableVersion(device->ApiVersion()); const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); @@ -419,8 +438,7 @@ void RendererVulkan::Report() const { LOG_INFO(Render_Vulkan, "Device: {}", model_name); LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); - auto& telemetry_session = system.TelemetrySession(); - constexpr auto field = Telemetry::FieldType::UserSystem; + static constexpr auto field = Common::Telemetry::FieldType::UserSystem; telemetry_session.AddField(field, "GPU_Vendor", vendor_name); telemetry_session.AddField(field, "GPU_Model", model_name); telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); @@ -431,7 +449,7 @@ void RendererVulkan::Report() const { std::vector<std::string> RendererVulkan::EnumerateDevices() { vk::InstanceDispatch dld; Common::DynamicLibrary library = OpenVulkanLibrary(); - vk::Instance instance = CreateInstance(library, dld); + vk::Instance instance = CreateInstance(library, dld).first; if (!instance) { return {}; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 18270909b..1044ca124 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -5,7 +5,6 @@ #pragma once #include <memory> -#include <optional> #include <string> #include <vector> @@ -15,7 +14,15 @@ #include "video_core/renderer_vulkan/wrapper.h" namespace Core { -class System; +class TelemetrySession; +} + +namespace Core::Memory { +class Memory; +} + +namespace Tegra { +class GPU; } namespace Vulkan { @@ -23,9 +30,7 @@ namespace Vulkan { class StateTracker; class VKBlitScreen; class VKDevice; -class VKFence; class VKMemoryManager; -class VKResourceManager; class VKSwapchain; 
class VKScheduler; class VKImage; @@ -39,13 +44,15 @@ struct VKScreenInfo { class RendererVulkan final : public VideoCore::RendererBase { public: - explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system); + explicit RendererVulkan(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, + Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererVulkan() override; bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; static std::vector<std::string> EnumerateDevices(); @@ -58,23 +65,26 @@ private: void Report() const; - Core::System& system; + Core::TelemetrySession& telemetry_session; + Core::Memory::Memory& cpu_memory; + Tegra::GPU& gpu; Common::DynamicLibrary library; vk::InstanceDispatch dld; vk::Instance instance; + u32 instance_version{}; + vk::SurfaceKHR surface; VKScreenInfo screen_info; vk::DebugCallback debug_callback; std::unique_ptr<VKDevice> device; - std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKMemoryManager> memory_manager; - std::unique_ptr<VKResourceManager> resource_manager; std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; + std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKBlitScreen> blit_screen; }; diff --git a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp b/src/video_core/renderer_vulkan/shaders/quad_indexed.comp new file mode 100644 index 000000000..5a472ba9b --- /dev/null +++ b/src/video_core/renderer_vulkan/shaders/quad_indexed.comp @@ -0,0 +1,50 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +/* + * Build instructions: + * $ glslangValidator -V quad_indexed.comp -o output.spv + * $ spirv-opt -O --strip-debug output.spv -o optimized.spv + * $ xxd -i optimized.spv + * + * Then copy that bytecode to the C++ file + */ + +#version 460 core + +layout (local_size_x = 1024) in; + +layout (std430, set = 0, binding = 0) readonly buffer InputBuffer { + uint input_indexes[]; +}; + +layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer { + uint output_indexes[]; +}; + +layout (push_constant) uniform PushConstants { + uint base_vertex; + int index_shift; // 0: uint8, 1: uint16, 2: uint32 +}; + +void main() { + int primitive = int(gl_GlobalInvocationID.x); + if (primitive * 6 >= output_indexes.length()) { + return; + } + + int index_size = 8 << index_shift; + int flipped_shift = 2 - index_shift; + int mask = (1 << flipped_shift) - 1; + + const int quad_swizzle[6] = int[](0, 1, 2, 0, 2, 3); + for (uint vertex = 0; vertex < 6; ++vertex) { + int offset = primitive * 4 + quad_swizzle[vertex]; + int int_offset = offset >> flipped_shift; + int bit_offset = (offset & mask) * index_size; + uint packed_input = input_indexes[int_offset]; + uint index = bitfieldExtract(packed_input, bit_offset, index_size); + output_indexes[primitive * 6 + vertex] = index + base_vertex; + } +} diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index fbd406f2b..b5b60309e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -12,11 +12,9 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/math_util.h" - #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/memory.h" - #include "video_core/gpu.h" #include "video_core/morton.h" #include "video_core/rasterizer_interface.h" @@ -24,8 +22,8 @@ #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -141,24 +139,28 @@ struct ScreenRectVertex { std::array<f32, 2> tex_coord; static VkVertexInputBindingDescription GetDescription() { - VkVertexInputBindingDescription description; - description.binding = 0; - description.stride = sizeof(ScreenRectVertex); - description.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - return description; + return { + .binding = 0, + .stride = sizeof(ScreenRectVertex), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }; } static std::array<VkVertexInputAttributeDescription, 2> GetAttributes() { - std::array<VkVertexInputAttributeDescription, 2> attributes; - attributes[0].location = 0; - attributes[0].binding = 0; - attributes[0].format = VK_FORMAT_R32G32_SFLOAT; - attributes[0].offset = offsetof(ScreenRectVertex, position); - attributes[1].location = 1; - attributes[1].binding = 0; - attributes[1].format = VK_FORMAT_R32G32_SFLOAT; - attributes[1].offset = offsetof(ScreenRectVertex, tex_coord); - return attributes; + return {{ + { + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(ScreenRectVertex, position), + }, + { + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + 
.offset = offsetof(ScreenRectVertex, tex_coord), + }, + }}; } }; @@ -183,9 +185,9 @@ std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& framebuffer) { VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { switch (framebuffer.pixel_format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: + case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM: return VK_FORMAT_A8B8G8R8_UNORM_PACK32; - case Tegra::FramebufferConfig::PixelFormat::RGB565: + case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: return VK_FORMAT_R5G6B5_UNORM_PACK16; default: UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", @@ -206,17 +208,15 @@ struct VKBlitScreen::BufferData { // Unaligned image data goes here }; -VKBlitScreen::VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info) - : system{system}, render_window{render_window}, rasterizer{rasterizer}, device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, swapchain{swapchain}, - scheduler{scheduler}, image_count{swapchain.GetImageCount()}, screen_info{screen_info} { - watches.resize(image_count); - std::generate(watches.begin(), watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); +VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, + Core::Frontend::EmuWindow& render_window_, + VideoCore::RasterizerInterface& rasterizer_, const VKDevice& device_, + VKMemoryManager& memory_manager_, VKSwapchain& swapchain_, + VKScheduler& scheduler_, const VKScreenInfo& screen_info_) + : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_}, + device{device_}, memory_manager{memory_manager_}, swapchain{swapchain_}, + scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + resource_ticks.resize(image_count); CreateStaticResources(); CreateDynamicResources(); @@ -228,15 +228,16 @@ void VKBlitScreen::Recreate() { CreateDynamicResources(); } -std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { +VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); const std::size_t image_index = swapchain.GetImageIndex(); - watches[image_index]->Watch(scheduler.GetFence()); + + scheduler.Wait(resource_ticks[image_index]); + resource_ticks[image_index] = scheduler.CurrentTick(); VKImage* blit_image = use_accelerated ? 
screen_info.image : raw_images[image_index].get(); @@ -255,7 +256,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon const auto pixel_format = VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; - const auto host_ptr = system.Memory().GetPointer(framebuffer_addr); + const auto host_ptr = cpu_memory.GetPointer(framebuffer_addr); rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer)); // TODO(Rodrigo): Read this from HLE @@ -267,20 +268,25 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - VkBufferImageCopy copy; - copy.bufferOffset = image_offset; - copy.bufferRowLength = 0; - copy.bufferImageHeight = 0; - copy.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - copy.imageSubresource.mipLevel = 0; - copy.imageSubresource.baseArrayLayer = 0; - copy.imageSubresource.layerCount = 1; - copy.imageOffset.x = 0; - copy.imageOffset.y = 0; - copy.imageOffset.z = 0; - copy.imageExtent.width = framebuffer.width; - copy.imageExtent.height = framebuffer.height; - copy.imageExtent.depth = 1; + const VkBufferImageCopy copy{ + .bufferOffset = image_offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = {.x = 0, .y = 0, .z = 0}, + .imageExtent = + { + .width = framebuffer.width, + .height = framebuffer.height, + .depth = 1, + }, + }; scheduler.Record( [buffer = *buffer, image = *blit_image->GetHandle(), copy](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); @@ -295,11 +301,9 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon descriptor_set = descriptor_sets[image_index], buffer = *buffer, size = swapchain.GetSize(), pipeline = *pipeline, layout = *pipeline_layout](vk::CommandBuffer cmdbuf) { - VkClearValue clear_color; - clear_color.color.float32[0] = 0.0f; - clear_color.color.float32[1] = 0.0f; - clear_color.color.float32[2] = 0.0f; - clear_color.color.float32[3] = 0.0f; + const VkClearValue clear_color{ + .color = {.float32 = {0.0f, 0.0f, 0.0f, 0.0f}}, + }; VkRenderPassBeginInfo renderpass_bi; renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; @@ -336,7 +340,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon cmdbuf.EndRenderPass(); }); - return {scheduler.GetFence(), *semaphores[image_index]}; + return *semaphores[image_index]; } void VKBlitScreen::CreateStaticResources() { @@ -379,93 +383,109 @@ void VKBlitScreen::CreateSemaphores() { } void VKBlitScreen::CreateDescriptorPool() { - std::array<VkDescriptorPoolSize, 2> pool_sizes; - pool_sizes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - pool_sizes[0].descriptorCount = static_cast<u32>(image_count); - pool_sizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - pool_sizes[1].descriptorCount = static_cast<u32>(image_count); - - VkDescriptorPoolCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - ci.maxSets = static_cast<u32>(image_count); - ci.poolSizeCount = static_cast<u32>(pool_sizes.size()); - ci.pPoolSizes = 
pool_sizes.data(); + const std::array<VkDescriptorPoolSize, 2> pool_sizes{{ + { + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = static_cast<u32>(image_count), + }, + { + .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = static_cast<u32>(image_count), + }, + }}; + + const VkDescriptorPoolCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = static_cast<u32>(image_count), + .poolSizeCount = static_cast<u32>(pool_sizes.size()), + .pPoolSizes = pool_sizes.data(), + }; descriptor_pool = device.GetLogical().CreateDescriptorPool(ci); } void VKBlitScreen::CreateRenderPass() { - VkAttachmentDescription color_attachment; - color_attachment.flags = 0; - color_attachment.format = swapchain.GetImageFormat(); - color_attachment.samples = VK_SAMPLE_COUNT_1_BIT; - color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - color_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - color_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - color_attachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - color_attachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - color_attachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference color_attachment_ref; - color_attachment_ref.attachment = 0; - color_attachment_ref.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass_description; - subpass_description.flags = 0; - subpass_description.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass_description.inputAttachmentCount = 0; - subpass_description.pInputAttachments = nullptr; - subpass_description.colorAttachmentCount = 1; - subpass_description.pColorAttachments = &color_attachment_ref; - subpass_description.pResolveAttachments = nullptr; - subpass_description.pDepthStencilAttachment = nullptr; - subpass_description.preserveAttachmentCount = 0; - subpass_description.pPreserveAttachments = nullptr; - - VkSubpassDependency dependency; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstAccessMask = - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - dependency.dependencyFlags = 0; - - VkRenderPassCreateInfo renderpass_ci; - renderpass_ci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderpass_ci.pNext = nullptr; - renderpass_ci.flags = 0; - renderpass_ci.attachmentCount = 1; - renderpass_ci.pAttachments = &color_attachment; - renderpass_ci.subpassCount = 1; - renderpass_ci.pSubpasses = &subpass_description; - renderpass_ci.dependencyCount = 1; - renderpass_ci.pDependencies = &dependency; + const VkAttachmentDescription color_attachment{ + .flags = 0, + .format = swapchain.GetImageFormat(), + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + }; + + const VkAttachmentReference color_attachment_ref{ + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }; + + const VkSubpassDescription subpass_description{ + .flags = 0, + .pipelineBindPoint = 
VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = 1, + .pColorAttachments = &color_attachment_ref, + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = nullptr, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; + + const VkSubpassDependency dependency{ + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + .dependencyFlags = 0, + }; + + const VkRenderPassCreateInfo renderpass_ci{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = 1, + .pAttachments = &color_attachment, + .subpassCount = 1, + .pSubpasses = &subpass_description, + .dependencyCount = 1, + .pDependencies = &dependency, + }; renderpass = device.GetLogical().CreateRenderPass(renderpass_ci); } void VKBlitScreen::CreateDescriptorSetLayout() { - std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings; - layout_bindings[0].binding = 0; - layout_bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - layout_bindings[0].descriptorCount = 1; - layout_bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - layout_bindings[0].pImmutableSamplers = nullptr; - layout_bindings[1].binding = 1; - layout_bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - layout_bindings[1].descriptorCount = 1; - layout_bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; - layout_bindings[1].pImmutableSamplers = nullptr; - - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = static_cast<u32>(layout_bindings.size()); - ci.pBindings = layout_bindings.data(); + const std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_VERTEX_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = nullptr, + }, + }}; + + const VkDescriptorSetLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast<u32>(layout_bindings.size()), + .pBindings = layout_bindings.data(), + }; descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci); } @@ -473,175 +493,192 @@ void VKBlitScreen::CreateDescriptorSetLayout() { void VKBlitScreen::CreateDescriptorSets() { const std::vector layouts(image_count, *descriptor_set_layout); - VkDescriptorSetAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.descriptorPool = *descriptor_pool; - ai.descriptorSetCount = static_cast<u32>(image_count); - ai.pSetLayouts = layouts.data(); + const VkDescriptorSetAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = *descriptor_pool, + .descriptorSetCount = static_cast<u32>(image_count), + .pSetLayouts = layouts.data(), + }; + descriptor_sets = descriptor_pool.Allocate(ai); } void VKBlitScreen::CreatePipelineLayout() { - VkPipelineLayoutCreateInfo ci; - 
ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; + const VkPipelineLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; pipeline_layout = device.GetLogical().CreatePipelineLayout(ci); } void VKBlitScreen::CreateGraphicsPipeline() { - std::array<VkPipelineShaderStageCreateInfo, 2> shader_stages; - shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[0].pNext = nullptr; - shader_stages[0].flags = 0; - shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; - shader_stages[0].module = *vertex_shader; - shader_stages[0].pName = "main"; - shader_stages[0].pSpecializationInfo = nullptr; - shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[1].pNext = nullptr; - shader_stages[1].flags = 0; - shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; - shader_stages[1].module = *fragment_shader; - shader_stages[1].pName = "main"; - shader_stages[1].pSpecializationInfo = nullptr; + const std::array<VkPipelineShaderStageCreateInfo, 2> shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; const auto vertex_binding_description = ScreenRectVertex::GetDescription(); const auto vertex_attrs_description = ScreenRectVertex::GetAttributes(); - VkPipelineVertexInputStateCreateInfo vertex_input_ci; - vertex_input_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertex_input_ci.pNext = nullptr; - vertex_input_ci.flags = 0; - vertex_input_ci.vertexBindingDescriptionCount = 1; - vertex_input_ci.pVertexBindingDescriptions = &vertex_binding_description; - vertex_input_ci.vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()}; - vertex_input_ci.pVertexAttributeDescriptions = vertex_attrs_description.data(); - - VkPipelineInputAssemblyStateCreateInfo input_assembly_ci; - input_assembly_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - input_assembly_ci.pNext = nullptr; - input_assembly_ci.flags = 0; - input_assembly_ci.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; - input_assembly_ci.primitiveRestartEnable = VK_FALSE; - - VkPipelineViewportStateCreateInfo viewport_state_ci; - viewport_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewport_state_ci.pNext = nullptr; - viewport_state_ci.flags = 0; - viewport_state_ci.viewportCount = 1; - viewport_state_ci.pViewports = nullptr; - viewport_state_ci.scissorCount = 1; - viewport_state_ci.pScissors = nullptr; - - VkPipelineRasterizationStateCreateInfo rasterization_ci; - rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterization_ci.pNext = nullptr; - rasterization_ci.flags = 0; - rasterization_ci.depthClampEnable = VK_FALSE; - 
rasterization_ci.rasterizerDiscardEnable = VK_FALSE; - rasterization_ci.polygonMode = VK_POLYGON_MODE_FILL; - rasterization_ci.cullMode = VK_CULL_MODE_NONE; - rasterization_ci.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterization_ci.depthBiasEnable = VK_FALSE; - rasterization_ci.depthBiasConstantFactor = 0.0f; - rasterization_ci.depthBiasClamp = 0.0f; - rasterization_ci.depthBiasSlopeFactor = 0.0f; - rasterization_ci.lineWidth = 1.0f; - - VkPipelineMultisampleStateCreateInfo multisampling_ci; - multisampling_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling_ci.pNext = nullptr; - multisampling_ci.flags = 0; - multisampling_ci.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling_ci.sampleShadingEnable = VK_FALSE; - multisampling_ci.minSampleShading = 0.0f; - multisampling_ci.pSampleMask = nullptr; - multisampling_ci.alphaToCoverageEnable = VK_FALSE; - multisampling_ci.alphaToOneEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState color_blend_attachment; - color_blend_attachment.blendEnable = VK_FALSE; - color_blend_attachment.srcColorBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.dstColorBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.colorBlendOp = VK_BLEND_OP_ADD; - color_blend_attachment.srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.alphaBlendOp = VK_BLEND_OP_ADD; - color_blend_attachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - - VkPipelineColorBlendStateCreateInfo color_blend_ci; - color_blend_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_ci.flags = 0; - color_blend_ci.pNext = nullptr; - color_blend_ci.logicOpEnable = VK_FALSE; - color_blend_ci.logicOp = VK_LOGIC_OP_COPY; - color_blend_ci.attachmentCount = 1; - color_blend_ci.pAttachments = &color_blend_attachment; - color_blend_ci.blendConstants[0] = 0.0f; - color_blend_ci.blendConstants[1] = 0.0f; - color_blend_ci.blendConstants[2] = 0.0f; - color_blend_ci.blendConstants[3] = 0.0f; - - static constexpr std::array dynamic_states = {VK_DYNAMIC_STATE_VIEWPORT, - VK_DYNAMIC_STATE_SCISSOR}; - VkPipelineDynamicStateCreateInfo dynamic_state_ci; - dynamic_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamic_state_ci.pNext = nullptr; - dynamic_state_ci.flags = 0; - dynamic_state_ci.dynamicStateCount = static_cast<u32>(dynamic_states.size()); - dynamic_state_ci.pDynamicStates = dynamic_states.data(); - - VkGraphicsPipelineCreateInfo pipeline_ci; - pipeline_ci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipeline_ci.pNext = nullptr; - pipeline_ci.flags = 0; - pipeline_ci.stageCount = static_cast<u32>(shader_stages.size()); - pipeline_ci.pStages = shader_stages.data(); - pipeline_ci.pVertexInputState = &vertex_input_ci; - pipeline_ci.pInputAssemblyState = &input_assembly_ci; - pipeline_ci.pTessellationState = nullptr; - pipeline_ci.pViewportState = &viewport_state_ci; - pipeline_ci.pRasterizationState = &rasterization_ci; - pipeline_ci.pMultisampleState = &multisampling_ci; - pipeline_ci.pDepthStencilState = nullptr; - pipeline_ci.pColorBlendState = &color_blend_ci; - pipeline_ci.pDynamicState = &dynamic_state_ci; - pipeline_ci.layout = *pipeline_layout; - pipeline_ci.renderPass = *renderpass; - pipeline_ci.subpass = 0; - pipeline_ci.basePipelineHandle = 0; - pipeline_ci.basePipelineIndex = 0; + const 
VkPipelineVertexInputStateCreateInfo vertex_input_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &vertex_binding_description, + .vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()}, + .pVertexAttributeDescriptions = vertex_attrs_description.data(), + }; + + const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = VK_FALSE, + }; + + const VkPipelineViewportStateCreateInfo viewport_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr, + }; + + const VkPipelineRasterizationStateCreateInfo rasterization_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, + }; + + const VkPipelineMultisampleStateCreateInfo multisampling_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, + }; + + const VkPipelineColorBlendAttachmentState color_blend_attachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + const VkPipelineColorBlendStateCreateInfo color_blend_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = 1, + .pAttachments = &color_blend_attachment, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, + }; + + static constexpr std::array dynamic_states{ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }; + const VkPipelineDynamicStateCreateInfo dynamic_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast<u32>(dynamic_states.size()), + .pDynamicStates = dynamic_states.data(), + }; + + const VkGraphicsPipelineCreateInfo pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(shader_stages.size()), + .pStages = shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + 
.pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *pipeline_layout, + .renderPass = *renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; pipeline = device.GetLogical().CreateGraphicsPipeline(pipeline_ci); } void VKBlitScreen::CreateSampler() { - VkSamplerCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.magFilter = VK_FILTER_LINEAR; - ci.minFilter = VK_FILTER_NEAREST; - ci.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; - ci.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.mipLodBias = 0.0f; - ci.anisotropyEnable = VK_FALSE; - ci.maxAnisotropy = 0.0f; - ci.compareEnable = VK_FALSE; - ci.compareOp = VK_COMPARE_OP_NEVER; - ci.minLod = 0.0f; - ci.maxLod = 0.0f; - ci.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - ci.unnormalizedCoordinates = VK_FALSE; + const VkSamplerCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .magFilter = VK_FILTER_LINEAR, + .minFilter = VK_FILTER_NEAREST, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 0.0f, + .compareEnable = VK_FALSE, + .compareOp = VK_COMPARE_OP_NEVER, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK, + .unnormalizedCoordinates = VK_FALSE, + }; sampler = device.GetLogical().CreateSampler(ci); } @@ -650,15 +687,17 @@ void VKBlitScreen::CreateFramebuffers() { const VkExtent2D size{swapchain.GetSize()}; framebuffers.resize(image_count); - VkFramebufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.renderPass = *renderpass; - ci.attachmentCount = 1; - ci.width = size.width; - ci.height = size.height; - ci.layers = 1; + VkFramebufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = *renderpass, + .attachmentCount = 1, + .pAttachments = nullptr, + .width = size.width, + .height = size.height, + .layers = 1, + }; for (std::size_t i = 0; i < image_count; ++i) { const VkImageView image_view{swapchain.GetImageViewIndex(i)}; @@ -669,7 +708,7 @@ void VKBlitScreen::CreateFramebuffers() { void VKBlitScreen::ReleaseRawImages() { for (std::size_t i = 0; i < raw_images.size(); ++i) { - watches[i]->Wait(); + scheduler.Wait(resource_ticks.at(i)); } raw_images.clear(); raw_buffer_commits.clear(); @@ -678,16 +717,17 @@ void VKBlitScreen::ReleaseRawImages() { } void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = CalculateBufferSize(framebuffer); - ci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; + const VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = CalculateBufferSize(framebuffer), + .usage = 
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; buffer = device.GetLogical().CreateBuffer(ci); buffer_commit = memory_manager.Commit(buffer, true); @@ -697,24 +737,28 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) raw_images.resize(image_count); raw_buffer_commits.resize(image_count); - VkImageCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.imageType = VK_IMAGE_TYPE_2D; - ci.format = GetFormat(framebuffer); - ci.extent.width = framebuffer.width; - ci.extent.height = framebuffer.height; - ci.extent.depth = 1; - ci.mipLevels = 1; - ci.arrayLayers = 1; - ci.samples = VK_SAMPLE_COUNT_1_BIT; - ci.tiling = VK_IMAGE_TILING_LINEAR; - ci.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - ci.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + const VkImageCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = GetFormat(framebuffer), + .extent = + { + .width = framebuffer.width, + .height = framebuffer.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; for (std::size_t i = 0; i < image_count; ++i) { raw_images[i] = std::make_unique<VKImage>(device, scheduler, ci, VK_IMAGE_ASPECT_COLOR_BIT); @@ -723,39 +767,43 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) } void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const { - VkDescriptorBufferInfo buffer_info; - buffer_info.buffer = *buffer; - buffer_info.offset = offsetof(BufferData, uniform); - buffer_info.range = sizeof(BufferData::uniform); - - VkWriteDescriptorSet ubo_write; - ubo_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - ubo_write.pNext = nullptr; - ubo_write.dstSet = descriptor_sets[image_index]; - ubo_write.dstBinding = 0; - ubo_write.dstArrayElement = 0; - ubo_write.descriptorCount = 1; - ubo_write.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - ubo_write.pImageInfo = nullptr; - ubo_write.pBufferInfo = &buffer_info; - ubo_write.pTexelBufferView = nullptr; - - VkDescriptorImageInfo image_info; - image_info.sampler = *sampler; - image_info.imageView = image_view; - image_info.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - - VkWriteDescriptorSet sampler_write; - sampler_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - sampler_write.pNext = nullptr; - sampler_write.dstSet = descriptor_sets[image_index]; - sampler_write.dstBinding = 1; - sampler_write.dstArrayElement = 0; - sampler_write.descriptorCount = 1; - sampler_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - sampler_write.pImageInfo = &image_info; - sampler_write.pBufferInfo = nullptr; - sampler_write.pTexelBufferView = nullptr; + const VkDescriptorBufferInfo buffer_info{ + .buffer = *buffer, + .offset = offsetof(BufferData, uniform), + .range = 
sizeof(BufferData::uniform), + }; + + const VkWriteDescriptorSet ubo_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .pImageInfo = nullptr, + .pBufferInfo = &buffer_info, + .pTexelBufferView = nullptr, + }; + + const VkDescriptorImageInfo image_info{ + .sampler = *sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + }; + + const VkWriteDescriptorSet sampler_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index], + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; device.GetLogical().UpdateDescriptorSets(std::array{ubo_write, sampler_write}, {}); } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 5eb544aea..8f2839214 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -4,18 +4,19 @@ #pragma once -#include <array> #include <memory> -#include <tuple> #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Core { class System; } +namespace Core::Memory { +class Memory; +} + namespace Core::Frontend { class EmuWindow; } @@ -31,26 +32,26 @@ class RasterizerInterface; namespace Vulkan { struct ScreenInfo; + class RasterizerVulkan; class VKDevice; -class VKFence; class VKImage; class VKScheduler; class VKSwapchain; class VKBlitScreen final { public: - explicit VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, + Core::Frontend::EmuWindow& render_window, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info); + VKMemoryManager& memory_manager, VKSwapchain& swapchain, + VKScheduler& scheduler, const VKScreenInfo& screen_info); ~VKBlitScreen(); void Recreate(); - std::tuple<VKFence&, VkSemaphore> Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated); private: struct BufferData; @@ -82,11 +83,10 @@ private: u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, std::size_t image_index) const; - Core::System& system; + Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; VideoCore::RasterizerInterface& rasterizer; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKSwapchain& swapchain; VKScheduler& scheduler; @@ -107,7 +107,7 @@ private: vk::Buffer buffer; VKMemoryCommit buffer_commit; - std::vector<std::unique_ptr<VKFenceWatch>> watches; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> semaphores; std::vector<std::unique_ptr<VKImage>> raw_images; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 0d167afbd..d9d3da9ea 100644 --- 
a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -5,12 +5,9 @@ #include <algorithm> #include <cstring> #include <memory> -#include <optional> -#include <tuple> -#include "common/assert.h" -#include "common/bit_util.h" #include "core/core.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -40,112 +37,88 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch } // Anonymous namespace -CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = static_cast<VkDeviceSize>(size); - ci.usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) + : BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} { + const VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = static_cast<VkDeviceSize>(size), + .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; buffer.handle = device.GetLogical().CreateBuffer(ci); buffer.commit = memory_manager.Commit(buffer.handle, false); } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} - -VKBufferCache::~VKBufferCache() = default; - -Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size); -} - -const VkBuffer* VKBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -const VkBuffer* VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.GetUnusedBuffer(size, false); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); - }); - return empty.handle.address(); -} - -void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { const auto& staging = staging_pool.GetUnusedBuffer(size, true); std::memcpy(staging.commit->Map(size), data, size); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = 
*staging.handle, buffer = *buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = offset; - barrier.size = size; + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size}); + + const VkBufferMemoryBarrier barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = UPLOAD_ACCESS_BARRIERS, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = handle, + .offset = offset, + .size = size, + }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, barrier, {}); }); } -void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { const auto& staging = staging_pool.GetUnusedBuffer(size, true); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = *buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = offset; - barrier.size = size; + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + const VkBufferMemoryBarrier barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = handle, + .offset = offset, + .size = size, + }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size}); + cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size}); }); scheduler.Finish(); std::memcpy(data, staging.commit->Map(size), size); } -void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = *src->GetHandle(), dst_buffer = *dst->GetHandle(), src_offset, - dst_offset, size](vk::CommandBuffer cmdbuf) { + + const VkBuffer dst_buffer = Handle(); + scheduler.Record([src_buffer = src.Handle(), 
dst_buffer, src_offset, dst_offset, + size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); std::array<VkBufferMemoryBarrier, 2> barriers; @@ -172,4 +145,31 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t }); } +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const VKDevice& device_, VKMemoryManager& memory_manager_, + VKScheduler& scheduler_, VKStagingBufferPool& staging_pool_) + : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, gpu_memory, cpu_memory, + CreateStreamBuffer(device_, + scheduler_)}, + device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ + staging_pool_} {} + +VKBufferCache::~VKBufferCache() = default; + +std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr, + size); +} + +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { + size = std::max(size, std::size_t(4)); + const auto& empty = staging_pool.GetUnusedBuffer(size, false); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, size, 0); + }); + return {*empty.handle, 0, 0}; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index d3c23da98..7fb5ceedf 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -5,68 +5,60 @@ #pragma once #include <memory> -#include <unordered_map> -#include <vector> #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/wrapper.h" -namespace Core { -class System; -} - namespace Vulkan { class VKDevice; class VKMemoryManager; class VKScheduler; -class CachedBufferBlock final : public VideoCommon::BufferBlock { +class Buffer final : public VideoCommon::BufferBlock { public: - explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size); - ~CachedBufferBlock(); + explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size); + ~Buffer(); + + void Upload(std::size_t offset, std::size_t size, const u8* data); + + void Download(std::size_t offset, std::size_t size, u8* data); + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size); - const VkBuffer* GetHandle() const { - return buffer.handle.address(); + VkBuffer Handle() const { + return *buffer.handle; + } + + u64 Address() const { + return 0; } private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; + VKBuffer buffer; }; -using Buffer = std::shared_ptr<CachedBufferBlock>; - class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { public: - explicit 
VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKBufferCache(); - const VkBuffer* GetEmptyBuffer(std::size_t size) override; + BufferInfo GetEmptyBuffer(std::size_t size) override; protected: - void WriteBarrier() override {} - - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - const VkBuffer* ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; private: const VKDevice& device; diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp new file mode 100644 index 000000000..6339f4fe0 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -0,0 +1,46 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> + +#include "video_core/renderer_vulkan/vk_command_pool.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +constexpr size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; + +struct CommandPool::Pool { + vk::CommandPool handle; + vk::CommandBuffers cmdbufs; +}; + +CommandPool::CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device) + : ResourcePool(master_semaphore, COMMAND_BUFFER_POOL_SIZE), device{device} {} + +CommandPool::~CommandPool() = default; + +void CommandPool::Allocate(size_t begin, size_t end) { + // Command buffers are going to be commited, recorded, executed every single usage cycle. + // They are also going to be reseted when commited. + Pool& pool = pools.emplace_back(); + pool.handle = device.GetLogical().CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); +} + +VkCommandBuffer CommandPool::Commit() { + const size_t index = CommitResource(); + const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; + const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; + return pools[pool_index].cmdbufs[sub_index]; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_command_pool.h b/src/video_core/renderer_vulkan/vk_command_pool.h new file mode 100644 index 000000000..b9cb3fb5d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.h @@ -0,0 +1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
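The new CommandPool above grows in fixed chunks of COMMAND_BUFFER_POOL_SIZE command buffers and maps the flat index returned by CommitResource() to a (pool, buffer) pair with a divide and a modulo. Below is a minimal standalone sketch of that chunked index bookkeeping; the class and member names are invented, slots are never recycled, and the MasterSemaphore synchronization that the real ResourcePool performs before reusing a slot is deliberately omitted.

// Standalone sketch of the chunked index math used by CommandPool::Commit above.
// Names are invented; the real ResourcePool also waits on the MasterSemaphore
// before a slot may be reused, which is not modelled here.
#include <cstddef>
#include <utility>

class ChunkedPoolSketch {
public:
    explicit ChunkedPoolSketch(std::size_t chunk_size_) : chunk_size{chunk_size_} {}

    // Hands out a flat index, growing by one chunk of `chunk_size` slots when exhausted.
    std::size_t Commit() {
        if (next_free == allocated) {
            Allocate(allocated, allocated + chunk_size); // mirrors ResourcePool::Allocate(begin, end)
            allocated += chunk_size;
        }
        return next_free++;
    }

    // Maps a flat index to (chunk, slot), exactly like index / COMMAND_BUFFER_POOL_SIZE
    // and index % COMMAND_BUFFER_POOL_SIZE in CommandPool::Commit.
    std::pair<std::size_t, std::size_t> Locate(std::size_t index) const {
        return {index / chunk_size, index % chunk_size};
    }

private:
    // In CommandPool, Allocate creates one VkCommandPool and allocates chunk_size
    // command buffers from it; here it is a no-op placeholder.
    void Allocate(std::size_t /*begin*/, std::size_t /*end*/) {}

    std::size_t chunk_size;
    std::size_t allocated = 0;
    std::size_t next_free = 0;
};

In CommandPool terms, Locate(Commit()) yields the pool index and the command buffer index within that pool, which is what Commit() returns as a VkCommandBuffer handle.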
+ +#pragma once + +#include <cstddef> +#include <vector> + +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class MasterSemaphore; +class VKDevice; + +class CommandPool final : public ResourcePool { +public: + explicit CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device); + ~CommandPool() override; + + void Allocate(size_t begin, size_t end) override; + + VkCommandBuffer Commit(); + +private: + struct Pool; + + const VKDevice& device; + std::vector<Pool> pools; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 9d92305f4..9637c6059 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -6,7 +6,7 @@ #include <memory> #include <optional> #include <utility> -#include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" @@ -112,35 +112,36 @@ constexpr u8 quad_array[] = { 0xf9, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() { - VkDescriptorSetLayoutBinding binding; - binding.binding = 0; - binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - binding.descriptorCount = 1; - binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - binding.pImmutableSamplers = nullptr; - return binding; + return { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }; } VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() { - VkDescriptorUpdateTemplateEntryKHR entry; - entry.dstBinding = 0; - entry.dstArrayElement = 0; - entry.descriptorCount = 1; - entry.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - entry.offset = 0; - entry.stride = sizeof(DescriptorUpdateEntry); - return entry; + return { + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), + }; } -VkPushConstantRange BuildQuadArrayPassPushConstantRange() { - VkPushConstantRange range; - range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - range.offset = 0; - range.size = sizeof(u32); - return range; +VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { + return { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = static_cast<u32>(size), + }; } // Uint8 SPIR-V module. Generated from the "shaders/" directory. 
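The quad-indexed SPIR-V module added in the hunk below expands every group of 4 quad indices into the 6 indices of two triangles, which is why the staging size changes to (num_vertices / 4) * 6 further down. The following CPU reference shows the same expansion for orientation only: it is an illustrative sketch, not the shader's code, and it simplifies the index_shift push constant (8/16/32-bit source indices) by assuming already-widened 32-bit indices; treating base_vertex as a plain addition to each fetched index is likewise an assumption here.

// CPU reference for the quad-to-triangle index expansion performed on the GPU by the
// quad-indexed compute pass. Illustrative sketch only; real input index width handling
// (index_shift) is simplified to pre-widened 32-bit indices.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint32_t> ExpandQuadIndices(const std::vector<uint32_t>& quad_indices,
                                        uint32_t base_vertex) {
    // Every 4 input indices (one quad) become 6 output indices (two triangles),
    // matching the (num_vertices / 4) * 6 sizing used by the pass.
    static constexpr uint32_t kQuadToTriangles[6] = {0, 1, 2, 0, 2, 3};
    std::vector<uint32_t> out;
    out.reserve((quad_indices.size() / 4) * 6);
    for (std::size_t quad = 0; quad + 4 <= quad_indices.size(); quad += 4) {
        for (const uint32_t offset : kQuadToTriangles) {
            out.push_back(quad_indices[quad + offset] + base_vertex);
        }
    }
    return out;
}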
@@ -218,32 +219,161 @@ constexpr u8 uint8_pass[] = { 0x2a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - -std::array<VkDescriptorSetLayoutBinding, 2> BuildUint8PassDescriptorSetBindings() { - std::array<VkDescriptorSetLayoutBinding, 2> bindings; - bindings[0].binding = 0; - bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - bindings[0].descriptorCount = 1; - bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - bindings[0].pImmutableSamplers = nullptr; - bindings[1].binding = 1; - bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - bindings[1].descriptorCount = 1; - bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - bindings[1].pImmutableSamplers = nullptr; - return bindings; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; + +// Quad indexed SPIR-V module. Generated from the "shaders/" directory. +constexpr u8 QUAD_INDEXED_SPV[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 
0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x09, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 
0x09, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x05, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x09, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x36, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 
0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0xc3, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0x5d, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x69, 0x00, 0x00, 0x00, 0x6a, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x6a, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; + +std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBindings() { + return {{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + }}; } -VkDescriptorUpdateTemplateEntryKHR BuildUint8PassDescriptorUpdateTemplateEntry() { - VkDescriptorUpdateTemplateEntryKHR entry; - entry.dstBinding = 0; - entry.dstArrayElement = 0; - entry.descriptorCount = 2; - entry.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - entry.offset = 0; - entry.stride = sizeof(DescriptorUpdateEntry); - return entry; +VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { + return { + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), + }; } } // Anonymous namespace @@ -253,37 +383,37 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates, vk::Span<VkPushConstantRange> 
push_constants, std::size_t code_size, const u8* code) { - VkDescriptorSetLayoutCreateInfo descriptor_layout_ci; - descriptor_layout_ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - descriptor_layout_ci.pNext = nullptr; - descriptor_layout_ci.flags = 0; - descriptor_layout_ci.bindingCount = bindings.size(); - descriptor_layout_ci.pBindings = bindings.data(); - descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(descriptor_layout_ci); - - VkPipelineLayoutCreateInfo pipeline_layout_ci; - pipeline_layout_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipeline_layout_ci.pNext = nullptr; - pipeline_layout_ci.flags = 0; - pipeline_layout_ci.setLayoutCount = 1; - pipeline_layout_ci.pSetLayouts = descriptor_set_layout.address(); - pipeline_layout_ci.pushConstantRangeCount = push_constants.size(); - pipeline_layout_ci.pPushConstantRanges = push_constants.data(); - layout = device.GetLogical().CreatePipelineLayout(pipeline_layout_ci); + descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = bindings.size(), + .pBindings = bindings.data(), + }); + + layout = device.GetLogical().CreatePipelineLayout({ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = push_constants.size(), + .pPushConstantRanges = push_constants.data(), + }); if (!templates.empty()) { - VkDescriptorUpdateTemplateCreateInfoKHR template_ci; - template_ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - template_ci.pNext = nullptr; - template_ci.flags = 0; - template_ci.descriptorUpdateEntryCount = templates.size(); - template_ci.pDescriptorUpdateEntries = templates.data(); - template_ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - template_ci.descriptorSetLayout = *descriptor_set_layout; - template_ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - template_ci.pipelineLayout = *layout; - template_ci.set = 0; - descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR(template_ci); + descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = templates.size(), + .pDescriptorUpdateEntries = templates.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = 0, + }); descriptor_allocator.emplace(descriptor_pool, *descriptor_set_layout); } @@ -291,42 +421,42 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto auto code_copy = std::make_unique<u32[]>(code_size / sizeof(u32) + 1); std::memcpy(code_copy.get(), code, code_size); - VkShaderModuleCreateInfo module_ci; - module_ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - module_ci.pNext = nullptr; - module_ci.flags = 0; - module_ci.codeSize = code_size; - module_ci.pCode = code_copy.get(); - module = device.GetLogical().CreateShaderModule(module_ci); - - VkComputePipelineCreateInfo pipeline_ci; - pipeline_ci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - pipeline_ci.pNext = nullptr; - pipeline_ci.flags = 0; - pipeline_ci.layout = 
*layout; - pipeline_ci.basePipelineHandle = nullptr; - pipeline_ci.basePipelineIndex = 0; - - VkPipelineShaderStageCreateInfo& stage_ci = pipeline_ci.stage; - stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - stage_ci.pNext = nullptr; - stage_ci.flags = 0; - stage_ci.stage = VK_SHADER_STAGE_COMPUTE_BIT; - stage_ci.module = *module; - stage_ci.pName = "main"; - stage_ci.pSpecializationInfo = nullptr; - - pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); + module = device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code_size, + .pCode = code_copy.get(), + }); + + pipeline = device.GetLogical().CreateComputePipeline({ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *module, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + .layout = *layout, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }); } VKComputePass::~VKComputePass() = default; -VkDescriptorSet VKComputePass::CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence) { +VkDescriptorSet VKComputePass::CommitDescriptorSet( + VKUpdateDescriptorQueue& update_descriptor_queue) { if (!descriptor_template) { return nullptr; } - const auto set = descriptor_allocator->Commit(fence); + const VkDescriptorSet set = descriptor_allocator->Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -337,20 +467,20 @@ QuadArrayPass::QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, VKUpdateDescriptorQueue& update_descriptor_queue) : VKComputePass(device, descriptor_pool, BuildQuadArrayPassDescriptorSetLayoutBinding(), BuildQuadArrayPassDescriptorUpdateTemplateEntry(), - BuildQuadArrayPassPushConstantRange(), std::size(quad_array), quad_array), + BuildComputePushConstantRange(sizeof(u32)), std::size(quad_array), quad_array), scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, update_descriptor_queue{update_descriptor_queue} {} QuadArrayPass::~QuadArrayPass() = default; -std::pair<const VkBuffer*, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) { - const u32 num_triangle_vertices = num_vertices * 6 / 4; +std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) { + const u32 num_triangle_vertices = (num_vertices / 4) * 6; const std::size_t staging_size = num_triangle_vertices * sizeof(u32); auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(buffer.handle.address(), 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); @@ -377,29 +507,29 @@ std::pair<const VkBuffer*, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertice cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {}); }); - return {buffer.handle.address(), 0}; + return {*buffer.handle, 0}; } Uint8Pass::Uint8Pass(const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKStagingBufferPool& 
staging_buffer_pool, VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildUint8PassDescriptorSetBindings(), - BuildUint8PassDescriptorUpdateTemplateEntry(), {}, std::size(uint8_pass), + : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), + BuildInputOutputDescriptorUpdateTemplate(), {}, std::size(uint8_pass), uint8_pass), scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, update_descriptor_queue{update_descriptor_queue} {} Uint8Pass::~Uint8Pass() = default; -std::pair<const VkBuffer*, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, - u64 src_offset) { - const auto staging_size = static_cast<u32>(num_vertices * sizeof(u16)); +std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, + u64 src_offset) { + const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(&src_buffer, src_offset, num_vertices); - update_descriptor_queue.AddBuffer(buffer.handle.address(), 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, @@ -422,7 +552,73 @@ std::pair<const VkBuffer*, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer s cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); }); - return {buffer.handle.address(), 0}; + return {*buffer.handle, 0}; +} + +QuadIndexedPass::QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, + VKDescriptorPool& descriptor_pool, + VKStagingBufferPool& staging_buffer_pool, + VKUpdateDescriptorQueue& update_descriptor_queue) + : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), + BuildInputOutputDescriptorUpdateTemplate(), + BuildComputePushConstantRange(sizeof(u32) * 2), std::size(QUAD_INDEXED_SPV), + QUAD_INDEXED_SPV), + scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, + update_descriptor_queue{update_descriptor_queue} {} + +QuadIndexedPass::~QuadIndexedPass() = default; + +std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( + Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, + VkBuffer src_buffer, u64 src_offset) { + const u32 index_shift = [index_format] { + switch (index_format) { + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: + return 0; + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedShort: + return 1; + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedInt: + return 2; + } + UNREACHABLE(); + return 2; + }(); + const u32 input_size = num_vertices << index_shift; + const u32 num_tri_vertices = (num_vertices / 4) * 6; + + const std::size_t staging_size = num_tri_vertices * sizeof(u32); + auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); + + update_descriptor_queue.Acquire(); + update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = 
CommitDescriptorSet(update_descriptor_queue); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, + num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { + static constexpr u32 dispatch_size = 1024; + const std::array push_constants = {base_vertex, index_shift}; + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), + &push_constants); + cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); + + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = buffer; + barrier.offset = 0; + barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32)); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + }); + return {*buffer.handle, 0}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index c62516bff..acc94f27e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -6,15 +6,15 @@ #include <optional> #include <utility> -#include <vector> + #include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { class VKDevice; -class VKFence; class VKScheduler; class VKStagingBufferPool; class VKUpdateDescriptorQueue; @@ -29,8 +29,7 @@ public: ~VKComputePass(); protected: - VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence); + VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue); vk::DescriptorUpdateTemplateKHR descriptor_template; vk::PipelineLayout layout; @@ -50,7 +49,7 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue); ~QuadArrayPass(); - std::pair<const VkBuffer*, VkDeviceSize> Assemble(u32 num_vertices, u32 first); + std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first); private: VKScheduler& scheduler; @@ -65,7 +64,25 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue); ~Uint8Pass(); - std::pair<const VkBuffer*, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + +private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_buffer_pool; + VKUpdateDescriptorQueue& update_descriptor_queue; +}; + +class QuadIndexedPass final : public VKComputePass { +public: + explicit QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, + VKDescriptorPool& descriptor_pool, + VKStagingBufferPool& staging_buffer_pool, + VKUpdateDescriptorQueue& update_descriptor_queue); + ~QuadIndexedPass(); + + std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, + u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, + u64 src_offset); private: 
VKScheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 23beafa4f..9be72dc9b 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -2,14 +2,12 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> #include <vector> #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -34,7 +32,7 @@ VkDescriptorSet VKComputePipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -45,39 +43,41 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const { const auto add_bindings = [&](VkDescriptorType descriptor_type, std::size_t num_entries) { // TODO(Rodrigo): Maybe make individual bindings here? for (u32 bindpoint = 0; bindpoint < static_cast<u32>(num_entries); ++bindpoint) { - VkDescriptorSetLayoutBinding& entry = bindings.emplace_back(); - entry.binding = binding++; - entry.descriptorType = descriptor_type; - entry.descriptorCount = 1; - entry.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - entry.pImmutableSamplers = nullptr; + bindings.push_back({ + .binding = binding++, + .descriptorType = descriptor_type, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }); } }; add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size()); - add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size()); - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = static_cast<u32>(bindings.size()); - ci.pBindings = bindings.data(); - return device.GetLogical().CreateDescriptorSetLayout(ci); + return device.GetLogical().CreateDescriptorSetLayout({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast<u32>(bindings.size()), + .pBindings = bindings.data(), + }); } vk::PipelineLayout VKComputePipeline::CreatePipelineLayout() const { - VkPipelineLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; - return device.GetLogical().CreatePipelineLayout(ci); + return 
device.GetLogical().CreatePipelineLayout({ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }); } vk::DescriptorUpdateTemplateKHR VKComputePipeline::CreateDescriptorUpdateTemplate() const { @@ -90,57 +90,63 @@ vk::DescriptorUpdateTemplateKHR VKComputePipeline::CreateDescriptorUpdateTemplat return {}; } - VkDescriptorUpdateTemplateCreateInfoKHR ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - ci.pNext = nullptr; - ci.flags = 0; - ci.descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()); - ci.pDescriptorUpdateEntries = template_entries.data(); - ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - ci.descriptorSetLayout = *descriptor_set_layout; - ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - ci.pipelineLayout = *layout; - ci.set = DESCRIPTOR_SET; - return device.GetLogical().CreateDescriptorUpdateTemplateKHR(ci); + return device.GetLogical().CreateDescriptorUpdateTemplateKHR({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()), + .pDescriptorUpdateEntries = template_entries.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = DESCRIPTOR_SET, + }); } vk::ShaderModule VKComputePipeline::CreateShaderModule(const std::vector<u32>& code) const { - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.codeSize = code.size() * sizeof(u32); - ci.pCode = code.data(); - return device.GetLogical().CreateShaderModule(ci); + device.SaveShader(code); + + return device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code.size() * sizeof(u32), + .pCode = code.data(), + }); } vk::Pipeline VKComputePipeline::CreatePipeline() const { - VkComputePipelineCreateInfo ci; - VkPipelineShaderStageCreateInfo& stage_ci = ci.stage; - stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - stage_ci.pNext = nullptr; - stage_ci.flags = 0; - stage_ci.stage = VK_SHADER_STAGE_COMPUTE_BIT; - stage_ci.module = *shader_module; - stage_ci.pName = "main"; - stage_ci.pSpecializationInfo = nullptr; - - VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci; - subgroup_size_ci.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT; - subgroup_size_ci.pNext = nullptr; - subgroup_size_ci.requiredSubgroupSize = GuestWarpSize; + + VkComputePipelineCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *shader_module, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + .layout = *layout, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }; + + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = 
nullptr, + .requiredSubgroupSize = GuestWarpSize, + }; if (entries.uses_warps && device.IsGuestWarpSizeSupported(VK_SHADER_STAGE_COMPUTE_BIT)) { - stage_ci.pNext = &subgroup_size_ci; + ci.stage.pNext = &subgroup_size_ci; } - ci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.layout = *layout; - ci.basePipelineHandle = nullptr; - ci.basePipelineIndex = 0; return device.GetLogical().CreateComputePipeline(ci); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 33b9af29e..6e2f22a4a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -4,8 +4,6 @@ #pragma once -#include <memory> - #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index e9d528aa6..f38e089d5 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -2,13 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> #include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { @@ -16,14 +16,15 @@ namespace Vulkan { // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines. 
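The descriptor pool code in the hunks below keeps one active VkDescriptorPool and, when an allocation fails because the pool is exhausted, creates a fresh pool and retries. A rough sketch of that strategy using raw Vulkan calls instead of yuzu's vk:: wrappers follows; the pool sizes and the 0x400 set count are placeholders, and all error handling beyond the out-of-pool-memory retry is omitted.

// Illustrative sketch of the allocate-and-grow strategy in vk_descriptor_pool.cpp:
// allocate from the active pool, and on VK_ERROR_OUT_OF_POOL_MEMORY create a new pool
// and retry, as VKDescriptorPool::AllocateDescriptors does.
#include <vector>
#include <vulkan/vulkan.h>

VkDescriptorPool CreatePoolSketch(VkDevice device, uint32_t max_sets) {
    const VkDescriptorPoolSize pool_sizes[]{
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, max_sets * 90},  // placeholder ratios
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, max_sets * 60},
    };
    const VkDescriptorPoolCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
        .pNext = nullptr,
        .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
        .maxSets = max_sets,
        .poolSizeCount = static_cast<uint32_t>(std::size(pool_sizes)),
        .pPoolSizes = pool_sizes,
    };
    VkDescriptorPool pool = VK_NULL_HANDLE;
    vkCreateDescriptorPool(device, &ci, nullptr, &pool);
    return pool;
}

std::vector<VkDescriptorSet> AllocateSetsSketch(VkDevice device, VkDescriptorPool& active_pool,
                                                VkDescriptorSetLayout layout, uint32_t count) {
    const std::vector<VkDescriptorSetLayout> layouts(count, layout);
    std::vector<VkDescriptorSet> sets(count);
    VkDescriptorSetAllocateInfo ai{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .pNext = nullptr,
        .descriptorPool = active_pool,
        .descriptorSetCount = count,
        .pSetLayouts = layouts.data(),
    };
    if (vkAllocateDescriptorSets(device, &ai, sets.data()) == VK_ERROR_OUT_OF_POOL_MEMORY) {
        // The active pool is exhausted: switch to a new pool and allocate again.
        active_pool = CreatePoolSketch(device, 0x400); // 0x400 is a placeholder set count
        ai.descriptorPool = active_pool;
        vkAllocateDescriptorSets(device, &ai, sets.data());
    }
    return sets;
}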
constexpr std::size_t SETS_GROW_RATE = 0x20; -DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool, - VkDescriptorSetLayout layout) - : VKFencedPool{SETS_GROW_RATE}, descriptor_pool{descriptor_pool}, layout{layout} {} +DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool_, + VkDescriptorSetLayout layout_) + : ResourcePool(descriptor_pool_.master_semaphore, SETS_GROW_RATE), + descriptor_pool{descriptor_pool_}, layout{layout_} {} DescriptorAllocator::~DescriptorAllocator() = default; -VkDescriptorSet DescriptorAllocator::Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); +VkDescriptorSet DescriptorAllocator::Commit() { + const std::size_t index = CommitResource(); return descriptors_allocations[index / SETS_GROW_RATE][index % SETS_GROW_RATE]; } @@ -31,8 +32,9 @@ void DescriptorAllocator::Allocate(std::size_t begin, std::size_t end) { descriptors_allocations.push_back(descriptor_pool.AllocateDescriptors(layout, end - begin)); } -VKDescriptorPool::VKDescriptorPool(const VKDevice& device) - : device{device}, active_pool{AllocateNewPool()} {} +VKDescriptorPool::VKDescriptorPool(const VKDevice& device_, VKScheduler& scheduler) + : device{device_}, master_semaphore{scheduler.GetMasterSemaphore()}, active_pool{ + AllocateNewPool()} {} VKDescriptorPool::~VKDescriptorPool() = default; @@ -43,27 +45,31 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() { {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60}, {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64}, - {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}}; - - VkDescriptorPoolCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - ci.maxSets = num_sets; - ci.poolSizeCount = static_cast<u32>(std::size(pool_sizes)); - ci.pPoolSizes = std::data(pool_sizes); + {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64}, + {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}, + }; + + const VkDescriptorPoolCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = num_sets, + .poolSizeCount = static_cast<u32>(std::size(pool_sizes)), + .pPoolSizes = std::data(pool_sizes), + }; return &pools.emplace_back(device.GetLogical().CreateDescriptorPool(ci)); } vk::DescriptorSets VKDescriptorPool::AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count) { const std::vector layout_copies(count, layout); - VkDescriptorSetAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.descriptorPool = **active_pool; - ai.descriptorSetCount = static_cast<u32>(count); - ai.pSetLayouts = layout_copies.data(); + VkDescriptorSetAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = **active_pool, + .descriptorSetCount = static_cast<u32>(count), + .pSetLayouts = layout_copies.data(), + }; vk::DescriptorSets sets = active_pool->Allocate(ai); if (!sets.IsOutOfPoolMemory()) { diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.h b/src/video_core/renderer_vulkan/vk_descriptor_pool.h index ab40c70f0..544f32a20 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.h +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.h @@ -4,25 +4,26 @@ #pragma once -#include <memory> #include <vector> -#include 
"common/common_types.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +class VKDevice; class VKDescriptorPool; +class VKScheduler; -class DescriptorAllocator final : public VKFencedPool { +class DescriptorAllocator final : public ResourcePool { public: explicit DescriptorAllocator(VKDescriptorPool& descriptor_pool, VkDescriptorSetLayout layout); ~DescriptorAllocator() override; + DescriptorAllocator& operator=(const DescriptorAllocator&) = delete; DescriptorAllocator(const DescriptorAllocator&) = delete; - VkDescriptorSet Commit(VKFence& fence); + VkDescriptorSet Commit(); protected: void Allocate(std::size_t begin, std::size_t end) override; @@ -38,15 +39,19 @@ class VKDescriptorPool final { friend DescriptorAllocator; public: - explicit VKDescriptorPool(const VKDevice& device); + explicit VKDescriptorPool(const VKDevice& device, VKScheduler& scheduler); ~VKDescriptorPool(); + VKDescriptorPool(const VKDescriptorPool&) = delete; + VKDescriptorPool& operator=(const VKDescriptorPool&) = delete; + private: vk::DescriptorPool* AllocateNewPool(); vk::DescriptorSets AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count); const VKDevice& device; + MasterSemaphore& master_semaphore; std::vector<vk::DescriptorPool> pools; vk::DescriptorPool* active_pool; diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 52d29e49d..f34ed6735 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -4,11 +4,11 @@ #include <bitset> #include <chrono> -#include <cstdlib> #include <optional> #include <string_view> #include <thread> #include <unordered_set> +#include <utility> #include <vector> #include "common/assert.h" @@ -22,19 +22,30 @@ namespace { namespace Alternatives { -constexpr std::array Depth24UnormS8_UINT = {VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D16_UNORM_S8_UINT, VkFormat{}}; -constexpr std::array Depth16UnormS8_UINT = {VK_FORMAT_D24_UNORM_S8_UINT, - VK_FORMAT_D32_SFLOAT_S8_UINT, VkFormat{}}; +constexpr std::array Depth24UnormS8_UINT{ + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D16_UNORM_S8_UINT, + VkFormat{}, +}; + +constexpr std::array Depth16UnormS8_UINT{ + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D32_SFLOAT_S8_UINT, + VkFormat{}, +}; } // namespace Alternatives -constexpr std::array REQUIRED_EXTENSIONS = { +constexpr std::array REQUIRED_EXTENSIONS{ VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, + VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME, VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, + VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, @@ -71,76 +82,105 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType } } +[[nodiscard]] bool IsRDNA(std::string_view device_name, VkDriverIdKHR driver_id) { + static constexpr std::array RDNA_DEVICES{ + "5700", + "5600", + "5500", + "5300", + }; + if (driver_id != VK_DRIVER_ID_AMD_PROPRIETARY_KHR) { + return false; + } + return std::any_of(RDNA_DEVICES.begin(), RDNA_DEVICES.end(), [device_name](const char* name) { + return device_name.find(name) != 
std::string_view::npos; + }); +} + std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { - static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32, - VK_FORMAT_A8B8G8R8_UINT_PACK32, - VK_FORMAT_A8B8G8R8_SNORM_PACK32, - VK_FORMAT_A8B8G8R8_SRGB_PACK32, - VK_FORMAT_B5G6R5_UNORM_PACK16, - VK_FORMAT_A2B10G10R10_UNORM_PACK32, - VK_FORMAT_A1R5G5B5_UNORM_PACK16, - VK_FORMAT_R32G32B32A32_SFLOAT, - VK_FORMAT_R32G32B32A32_UINT, - VK_FORMAT_R32G32_SFLOAT, - VK_FORMAT_R32G32_UINT, - VK_FORMAT_R16G16B16A16_UINT, - VK_FORMAT_R16G16B16A16_SNORM, - VK_FORMAT_R16G16B16A16_UNORM, - VK_FORMAT_R16G16_UNORM, - VK_FORMAT_R16G16_SNORM, - VK_FORMAT_R16G16_SFLOAT, - VK_FORMAT_R16_UNORM, - VK_FORMAT_R8G8B8A8_SRGB, - VK_FORMAT_R8G8_UNORM, - VK_FORMAT_R8G8_SNORM, - VK_FORMAT_R8_UNORM, - VK_FORMAT_R8_UINT, - VK_FORMAT_B10G11R11_UFLOAT_PACK32, - VK_FORMAT_R32_SFLOAT, - VK_FORMAT_R32_UINT, - VK_FORMAT_R32_SINT, - VK_FORMAT_R16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, - VK_FORMAT_B8G8R8A8_UNORM, - VK_FORMAT_R4G4B4A4_UNORM_PACK16, - VK_FORMAT_D32_SFLOAT, - VK_FORMAT_D16_UNORM, - VK_FORMAT_D16_UNORM_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT, - VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_BC1_RGBA_UNORM_BLOCK, - VK_FORMAT_BC2_UNORM_BLOCK, - VK_FORMAT_BC3_UNORM_BLOCK, - VK_FORMAT_BC4_UNORM_BLOCK, - VK_FORMAT_BC5_UNORM_BLOCK, - VK_FORMAT_BC5_SNORM_BLOCK, - VK_FORMAT_BC7_UNORM_BLOCK, - VK_FORMAT_BC6H_UFLOAT_BLOCK, - VK_FORMAT_BC6H_SFLOAT_BLOCK, - VK_FORMAT_BC1_RGBA_SRGB_BLOCK, - VK_FORMAT_BC2_SRGB_BLOCK, - VK_FORMAT_BC3_SRGB_BLOCK, - VK_FORMAT_BC7_SRGB_BLOCK, - VK_FORMAT_ASTC_4x4_SRGB_BLOCK, - VK_FORMAT_ASTC_8x8_SRGB_BLOCK, - VK_FORMAT_ASTC_8x5_SRGB_BLOCK, - VK_FORMAT_ASTC_5x4_SRGB_BLOCK, - VK_FORMAT_ASTC_5x5_UNORM_BLOCK, - VK_FORMAT_ASTC_5x5_SRGB_BLOCK, - VK_FORMAT_ASTC_10x8_UNORM_BLOCK, - VK_FORMAT_ASTC_10x8_SRGB_BLOCK, - VK_FORMAT_ASTC_6x6_UNORM_BLOCK, - VK_FORMAT_ASTC_6x6_SRGB_BLOCK, - VK_FORMAT_ASTC_10x10_UNORM_BLOCK, - VK_FORMAT_ASTC_10x10_SRGB_BLOCK, - VK_FORMAT_ASTC_12x12_UNORM_BLOCK, - VK_FORMAT_ASTC_12x12_SRGB_BLOCK, - VK_FORMAT_ASTC_8x6_UNORM_BLOCK, - VK_FORMAT_ASTC_8x6_SRGB_BLOCK, - VK_FORMAT_ASTC_6x5_UNORM_BLOCK, - VK_FORMAT_ASTC_6x5_SRGB_BLOCK, - VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}; + static constexpr std::array formats{ + VK_FORMAT_A8B8G8R8_UNORM_PACK32, + VK_FORMAT_A8B8G8R8_UINT_PACK32, + VK_FORMAT_A8B8G8R8_SNORM_PACK32, + VK_FORMAT_A8B8G8R8_SINT_PACK32, + VK_FORMAT_A8B8G8R8_SRGB_PACK32, + VK_FORMAT_B5G6R5_UNORM_PACK16, + VK_FORMAT_A2B10G10R10_UNORM_PACK32, + VK_FORMAT_A2B10G10R10_UINT_PACK32, + VK_FORMAT_A1R5G5B5_UNORM_PACK16, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_R32G32B32A32_SINT, + VK_FORMAT_R32G32B32A32_UINT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32_SINT, + VK_FORMAT_R32G32_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16_UNORM, + VK_FORMAT_R16G16_SNORM, + VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R16_UNORM, + VK_FORMAT_R16_UINT, + VK_FORMAT_R8G8B8A8_SRGB, + VK_FORMAT_R8G8_UNORM, + VK_FORMAT_R8G8_SNORM, + VK_FORMAT_R8G8_SINT, + VK_FORMAT_R8G8_UINT, + VK_FORMAT_R8_UNORM, + VK_FORMAT_R8_SNORM, + VK_FORMAT_R8_SINT, + VK_FORMAT_R8_UINT, + VK_FORMAT_B10G11R11_UFLOAT_PACK32, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32_UINT, + VK_FORMAT_R32_SINT, + VK_FORMAT_R16_SFLOAT, + VK_FORMAT_R16G16B16A16_SFLOAT, + VK_FORMAT_B8G8R8A8_UNORM, + VK_FORMAT_B8G8R8A8_SRGB, + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_D32_SFLOAT, + 
VK_FORMAT_D16_UNORM, + VK_FORMAT_D16_UNORM_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_BC1_RGBA_UNORM_BLOCK, + VK_FORMAT_BC2_UNORM_BLOCK, + VK_FORMAT_BC3_UNORM_BLOCK, + VK_FORMAT_BC4_UNORM_BLOCK, + VK_FORMAT_BC4_SNORM_BLOCK, + VK_FORMAT_BC5_UNORM_BLOCK, + VK_FORMAT_BC5_SNORM_BLOCK, + VK_FORMAT_BC7_UNORM_BLOCK, + VK_FORMAT_BC6H_UFLOAT_BLOCK, + VK_FORMAT_BC6H_SFLOAT_BLOCK, + VK_FORMAT_BC1_RGBA_SRGB_BLOCK, + VK_FORMAT_BC2_SRGB_BLOCK, + VK_FORMAT_BC3_SRGB_BLOCK, + VK_FORMAT_BC7_SRGB_BLOCK, + VK_FORMAT_ASTC_4x4_SRGB_BLOCK, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_5x4_SRGB_BLOCK, + VK_FORMAT_ASTC_5x5_UNORM_BLOCK, + VK_FORMAT_ASTC_5x5_SRGB_BLOCK, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK, + VK_FORMAT_ASTC_6x6_UNORM_BLOCK, + VK_FORMAT_ASTC_6x6_SRGB_BLOCK, + VK_FORMAT_ASTC_10x10_UNORM_BLOCK, + VK_FORMAT_ASTC_10x10_SRGB_BLOCK, + VK_FORMAT_ASTC_12x12_UNORM_BLOCK, + VK_FORMAT_ASTC_12x12_SRGB_BLOCK, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, + }; std::unordered_map<VkFormat, VkFormatProperties> format_properties; for (const auto format : formats) { format_properties.emplace(format, physical.GetFormatProperties(format)); @@ -150,10 +190,10 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( } // Anonymous namespace -VKDevice::VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld) - : dld{dld}, physical{physical}, properties{physical.GetProperties()}, - format_properties{GetFormatProperties(physical, dld)} { +VKDevice::VKDevice(VkInstance instance_, u32 instance_version_, vk::PhysicalDevice physical_, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld_) + : dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, + instance_version{instance_version_}, format_properties{GetFormatProperties(physical, dld)} { SetupFamilies(surface); SetupFeatures(); } @@ -164,107 +204,127 @@ bool VKDevice::Create() { const auto queue_cis = GetDeviceQueueCreateInfos(); const std::vector extensions = LoadExtensions(); - VkPhysicalDeviceFeatures2 features2; - features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - features2.pNext = nullptr; + VkPhysicalDeviceFeatures2 features2{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = nullptr, + }; + const void* first_next = &features2; void** next = &features2.pNext; - auto& features = features2.features; - features.robustBufferAccess = false; - features.fullDrawIndexUint32 = false; - features.imageCubeArray = false; - features.independentBlend = true; - features.geometryShader = true; - features.tessellationShader = true; - features.sampleRateShading = false; - features.dualSrcBlend = false; - features.logicOp = false; - features.multiDrawIndirect = false; - features.drawIndirectFirstInstance = false; - features.depthClamp = true; - features.depthBiasClamp = true; - features.fillModeNonSolid = false; - features.depthBounds = false; - features.wideLines = false; - features.largePoints = true; - features.alphaToOne = false; - features.multiViewport = true; - features.samplerAnisotropy = true; - features.textureCompressionETC2 = false; - features.textureCompressionASTC_LDR = is_optimal_astc_supported; - features.textureCompressionBC = false; - features.occlusionQueryPrecise = true; - features.pipelineStatisticsQuery = false; - 
features.vertexPipelineStoresAndAtomics = true; - features.fragmentStoresAndAtomics = true; - features.shaderTessellationAndGeometryPointSize = false; - features.shaderImageGatherExtended = true; - features.shaderStorageImageExtendedFormats = false; - features.shaderStorageImageMultisample = false; - features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported; - features.shaderStorageImageWriteWithoutFormat = true; - features.shaderUniformBufferArrayDynamicIndexing = false; - features.shaderSampledImageArrayDynamicIndexing = false; - features.shaderStorageBufferArrayDynamicIndexing = false; - features.shaderStorageImageArrayDynamicIndexing = false; - features.shaderClipDistance = false; - features.shaderCullDistance = false; - features.shaderFloat64 = false; - features.shaderInt64 = false; - features.shaderInt16 = false; - features.shaderResourceResidency = false; - features.shaderResourceMinLod = false; - features.sparseBinding = false; - features.sparseResidencyBuffer = false; - features.sparseResidencyImage2D = false; - features.sparseResidencyImage3D = false; - features.sparseResidency2Samples = false; - features.sparseResidency4Samples = false; - features.sparseResidency8Samples = false; - features.sparseResidency16Samples = false; - features.sparseResidencyAliased = false; - features.variableMultisampleRate = false; - features.inheritedQueries = false; - - VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage; - bit16_storage.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; - bit16_storage.pNext = nullptr; - bit16_storage.storageBuffer16BitAccess = false; - bit16_storage.uniformAndStorageBuffer16BitAccess = true; - bit16_storage.storagePushConstant16 = false; - bit16_storage.storageInputOutput16 = false; + features2.features = { + .robustBufferAccess = false, + .fullDrawIndexUint32 = false, + .imageCubeArray = false, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = false, + .dualSrcBlend = false, + .logicOp = false, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = false, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = false, + .depthBounds = false, + .wideLines = false, + .largePoints = true, + .alphaToOne = false, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = is_optimal_astc_supported, + .textureCompressionBC = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = false, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = false, + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = false, + .shaderSampledImageArrayDynamicIndexing = false, + .shaderStorageBufferArrayDynamicIndexing = false, + .shaderStorageImageArrayDynamicIndexing = false, + .shaderClipDistance = false, + .shaderCullDistance = false, + .shaderFloat64 = false, + .shaderInt64 = false, + .shaderInt16 = false, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + 
.sparseResidency8Samples = false, + .sparseResidency16Samples = false, + .sparseResidencyAliased = false, + .variableMultisampleRate = false, + .inheritedQueries = false, + }; + + VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, + .pNext = nullptr, + .timelineSemaphore = true, + }; + SetNext(next, timeline_semaphore); + + VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, + .pNext = nullptr, + .storageBuffer16BitAccess = false, + .uniformAndStorageBuffer16BitAccess = true, + .storagePushConstant16 = false, + .storageInputOutput16 = false, + }; SetNext(next, bit16_storage); - VkPhysicalDevice8BitStorageFeaturesKHR bit8_storage; - bit8_storage.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; - bit8_storage.pNext = nullptr; - bit8_storage.storageBuffer8BitAccess = false; - bit8_storage.uniformAndStorageBuffer8BitAccess = true; - bit8_storage.storagePushConstant8 = false; + VkPhysicalDevice8BitStorageFeaturesKHR bit8_storage{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR, + .pNext = nullptr, + .storageBuffer8BitAccess = false, + .uniformAndStorageBuffer8BitAccess = true, + .storagePushConstant8 = false, + }; SetNext(next, bit8_storage); - VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; - host_query_reset.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT; - host_query_reset.hostQueryReset = true; + VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT, + .hostQueryReset = true, + }; SetNext(next, host_query_reset); VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8; if (is_float16_supported) { - float16_int8.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; - float16_int8.pNext = nullptr; - float16_int8.shaderFloat16 = true; - float16_int8.shaderInt8 = false; + float16_int8 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR, + .pNext = nullptr, + .shaderFloat16 = true, + .shaderInt8 = false, + }; SetNext(next, float16_int8); } else { LOG_INFO(Render_Vulkan, "Device doesn't support float16 natively"); } + if (!nv_viewport_swizzle) { + LOG_INFO(Render_Vulkan, "Device doesn't support viewport swizzles"); + } + VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR std430_layout; if (khr_uniform_buffer_standard_layout) { - std430_layout.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR; - std430_layout.pNext = nullptr; - std430_layout.uniformBufferStandardLayout = true; + std430_layout = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR, + .pNext = nullptr, + .uniformBufferStandardLayout = true, + }; SetNext(next, std430_layout); } else { LOG_INFO(Render_Vulkan, "Device doesn't support packed UBOs"); @@ -272,9 +332,11 @@ bool VKDevice::Create() { VkPhysicalDeviceIndexTypeUint8FeaturesEXT index_type_uint8; if (ext_index_type_uint8) { - index_type_uint8.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT; - index_type_uint8.pNext = nullptr; - index_type_uint8.indexTypeUint8 = true; + index_type_uint8 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT, + .pNext = nullptr, + .indexTypeUint8 = true, + }; SetNext(next, index_type_uint8); } else { LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); 
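
The block above grows the VkPhysicalDeviceFeatures2 pNext chain one feature struct at a time through SetNext, and only chains a struct when the matching extension was detected (the unsupported branches just log). SetNext itself is a small helper defined outside this hunk; a minimal sketch of the idiom it implements, for orientation only:

    // Append `data` to the tail of a Vulkan pNext chain and advance the tail pointer,
    // so that the next SetNext call chains after this struct.
    template <typename T>
    void SetNext(void**& next, T& data) {
        *next = &data;
        next = &data.pNext;
    }

This is also why Create() keeps both first_next (the chain head later handed to vk::Device::Create) and next (the moving tail): an extra structure such as the Nsight diagnostics config can still be placed in front of features2 without disturbing the feature chain built here.
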
@@ -282,21 +344,61 @@ bool VKDevice::Create() { VkPhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback; if (ext_transform_feedback) { - transform_feedback.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT; - transform_feedback.pNext = nullptr; - transform_feedback.transformFeedback = true; - transform_feedback.geometryStreams = true; + transform_feedback = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT, + .pNext = nullptr, + .transformFeedback = true, + .geometryStreams = true, + }; SetNext(next, transform_feedback); } else { LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks"); } + VkPhysicalDeviceCustomBorderColorFeaturesEXT custom_border; + if (ext_custom_border_color) { + custom_border = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT, + .pNext = nullptr, + .customBorderColors = VK_TRUE, + .customBorderColorWithoutFormat = VK_TRUE, + }; + SetNext(next, custom_border); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support custom border colors"); + } + + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state; + if (ext_extended_dynamic_state) { + dynamic_state = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT, + .pNext = nullptr, + .extendedDynamicState = VK_TRUE, + }; + SetNext(next, dynamic_state); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support extended dynamic state"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } - logical = vk::Device::Create(physical, queue_cis, extensions, features2, dld); + VkDeviceDiagnosticsConfigCreateInfoNV diagnostics_nv; + if (nv_device_diagnostics_config) { + nsight_aftermath_tracker.Initialize(); + + diagnostics_nv = { + .sType = VK_STRUCTURE_TYPE_DEVICE_DIAGNOSTICS_CONFIG_CREATE_INFO_NV, + .pNext = &features2, + .flags = VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_SHADER_DEBUG_INFO_BIT_NV | + VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_RESOURCE_TRACKING_BIT_NV | + VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_AUTOMATIC_CHECKPOINTS_BIT_NV, + }; + first_next = &diagnostics_nv; + } + + logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); if (!logical) { LOG_ERROR(Render_Vulkan, "Failed to create logical device"); return false; @@ -304,8 +406,19 @@ bool VKDevice::Create() { CollectTelemetryParameters(); + if (ext_extended_dynamic_state && IsRDNA(properties.deviceName, driver_id)) { + // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but on RDNA devices it + // seems to cause stability issues + LOG_WARNING( + Render_Vulkan, + "Blacklisting AMD proprietary on RDNA devices from VK_EXT_extended_dynamic_state"); + ext_extended_dynamic_state = false; + } + graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); + + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); return true; } @@ -344,17 +457,12 @@ VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFla void VKDevice::ReportLoss() const { LOG_CRITICAL(Render_Vulkan, "Device loss occured!"); - // Wait some time to let the log flush - std::this_thread::sleep_for(std::chrono::seconds{1}); - - if (!nv_device_diagnostic_checkpoints) { - return; - } + // Wait for the log to flush and for Nsight Aftermath to dump the results + std::this_thread::sleep_for(std::chrono::seconds{3}); +} - [[maybe_unused]] const std::vector data = 
graphics_queue.GetCheckpointDataNV(dld); - // Catch here in debug builds (or with optimizations disabled) the last graphics pipeline to be - // executed. It can be done on a debugger by evaluating the expression: - // *(VKGraphicsPipeline*)data[0] +void VKDevice::SaveShader(const std::vector<u32>& spirv) const { + nsight_aftermath_tracker.SaveShader(spirv); } bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { @@ -492,43 +600,44 @@ bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { std::vector<const char*> VKDevice::LoadExtensions() { std::vector<const char*> extensions; - const auto Test = [&](const VkExtensionProperties& extension, - std::optional<std::reference_wrapper<bool>> status, const char* name, - bool push) { - if (extension.extensionName != std::string_view(name)) { - return; - } - if (push) { - extensions.push_back(name); - } - if (status) { - status->get() = true; - } - }; - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); bool has_khr_shader_float16_int8{}; bool has_ext_subgroup_size_control{}; bool has_ext_transform_feedback{}; - for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) { - Test(extension, khr_uniform_buffer_standard_layout, + bool has_ext_custom_border_color{}; + bool has_ext_extended_dynamic_state{}; + for (const VkExtensionProperties& extension : physical.EnumerateDeviceExtensionProperties()) { + const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name, + bool push) { + if (extension.extensionName != std::string_view(name)) { + return; + } + if (push) { + extensions.push_back(name); + } + if (status) { + status->get() = true; + } + }; + test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); + test(khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, - false); - Test(extension, ext_depth_range_unrestricted, - VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); - Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); - Test(extension, ext_shader_viewport_index_layer, - VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, - false); - Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, - false); + test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); + test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); + test(ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, + true); + test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); + test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); + test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + if (instance_version >= VK_API_VERSION_1_1) { + test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); + } if (Settings::values.renderer_debug) { - Test(extension, nv_device_diagnostic_checkpoints, - VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + test(nv_device_diagnostics_config, 
VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, + true); } } @@ -598,6 +707,32 @@ std::vector<const char*> VKDevice::LoadExtensions() { } } + if (has_ext_custom_border_color) { + VkPhysicalDeviceCustomBorderColorFeaturesEXT border_features; + border_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT; + border_features.pNext = nullptr; + features.pNext = &border_features; + physical.GetFeatures2KHR(features); + + if (border_features.customBorderColors && border_features.customBorderColorWithoutFormat) { + extensions.push_back(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); + ext_custom_border_color = true; + } + } + + if (has_ext_extended_dynamic_state) { + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state; + dynamic_state.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT; + dynamic_state.pNext = nullptr; + features.pNext = &dynamic_state; + physical.GetFeatures2KHR(features); + + if (dynamic_state.extendedDynamicState) { + extensions.push_back(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + ext_extended_dynamic_state = true; + } + } + return extensions; } @@ -633,14 +768,21 @@ void VKDevice::SetupFeatures() { } void VKDevice::CollectTelemetryParameters() { - VkPhysicalDeviceDriverPropertiesKHR driver; - driver.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR; - driver.pNext = nullptr; + VkPhysicalDeviceDriverPropertiesKHR driver{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, + .pNext = nullptr, + .driverID = {}, + .driverName = {}, + .driverInfo = {}, + .conformanceVersion = {}, + }; - VkPhysicalDeviceProperties2KHR properties; - properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; - properties.pNext = &driver; - physical.GetProperties2KHR(properties); + VkPhysicalDeviceProperties2KHR device_properties{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &driver, + .properties = {}, + }; + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; @@ -648,23 +790,26 @@ void VKDevice::CollectTelemetryParameters() { const std::vector extensions = physical.EnumerateDeviceExtensionProperties(); reported_extensions.reserve(std::size(extensions)); for (const auto& extension : extensions) { - reported_extensions.push_back(extension.extensionName); + reported_extensions.emplace_back(extension.extensionName); } } std::vector<VkDeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const { static constexpr float QUEUE_PRIORITY = 1.0f; - std::unordered_set<u32> unique_queue_families = {graphics_family, present_family}; + std::unordered_set<u32> unique_queue_families{graphics_family, present_family}; std::vector<VkDeviceQueueCreateInfo> queue_cis; + queue_cis.reserve(unique_queue_families.size()); for (const u32 queue_family : unique_queue_families) { - VkDeviceQueueCreateInfo& ci = queue_cis.emplace_back(); - ci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.queueFamilyIndex = queue_family; - ci.queueCount = 1; + auto& ci = queue_cis.emplace_back(VkDeviceQueueCreateInfo{ + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queueFamilyIndex = queue_family, + .queueCount = 1, + .pQueuePriorities = nullptr, + }); ci.pQueuePriorities = &QUEUE_PRIORITY; } diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 60d64572a..4286673d9 100644 --- 
a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -10,6 +10,7 @@ #include <vector> #include "common/common_types.h" +#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { @@ -23,8 +24,8 @@ const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. class VKDevice final { public: - explicit VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld); + explicit VKDevice(VkInstance instance, u32 instance_version, vk::PhysicalDevice physical, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld); ~VKDevice(); /// Initializes the device. Returns true on success. @@ -43,6 +44,9 @@ public: /// Reports a device loss. void ReportLoss() const; + /// Reports a shader to Nsight Aftermath. + void SaveShader(const std::vector<u32>& spirv) const; + /// Returns the dispatch loader with direct function pointers of the device. const vk::DeviceDispatch& GetDispatchLoader() const { return dld; @@ -78,13 +82,13 @@ public: return present_family; } - /// Returns true if the device is integrated with the host CPU. - bool IsIntegrated() const { - return properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; + /// Returns the current instance Vulkan API version in Vulkan-formatted version numbers. + u32 InstanceApiVersion() const { + return instance_version; } /// Returns the current Vulkan API version provided in Vulkan-formatted version numbers. - u32 GetApiVersion() const { + u32 ApiVersion() const { return properties.apiVersion; } @@ -123,6 +127,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns the maximum size for shared memory. + u32 GetMaxComputeSharedMemorySize() const { + return properties.limits.maxComputeSharedMemorySize; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -148,6 +157,11 @@ public: return is_formatless_image_load_supported; } + /// Returns true if the device supports VK_NV_viewport_swizzle. + bool IsNvViewportSwizzleSupported() const { + return nv_viewport_swizzle; + } + /// Returns true if the device supports VK_EXT_scalar_block_layout. bool IsKhrUniformBufferStandardLayoutSupported() const { return khr_uniform_buffer_standard_layout; @@ -173,9 +187,14 @@ public: return ext_transform_feedback; } - /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. - bool IsNvDeviceDiagnosticCheckpoints() const { - return nv_device_diagnostic_checkpoints; + /// Returns true if the device supports VK_EXT_custom_border_color. + bool IsExtCustomBorderColorSupported() const { + return ext_custom_border_color; + } + + /// Returns true if the device supports VK_EXT_extended_dynamic_state. + bool IsExtExtendedDynamicStateSupported() const { + return ext_extended_dynamic_state; } /// Returns the vendor name reported from Vulkan. @@ -188,6 +207,11 @@ public: return reported_extensions; } + /// Returns true if the setting for async shader compilation is enabled. + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + /// Checks if the physical device is suitable. static bool IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface); @@ -220,6 +244,7 @@ private: vk::Device logical; ///< Logical device. vk::Queue graphics_queue; ///< Main graphics queue. vk::Queue present_queue; ///< Main present queue. 
+ u32 instance_version{}; ///< Vulkan instance version. u32 graphics_family{}; ///< Main graphics queue family index. u32 present_family{}; ///< Main present queue family index. VkDriverIdKHR driver_id{}; ///< Driver ID. @@ -228,12 +253,18 @@ private: bool is_float16_supported{}; ///< Support for float16 arithmetics. bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. + bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. - bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool ext_custom_border_color{}; ///< Support for VK_EXT_custom_border_color. + bool ext_extended_dynamic_state{}; ///< Support for VK_EXT_extended_dynamic_state. + bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config. + + // Asynchronous Graphics Pipeline setting + bool use_asynchronous_shaders{}; ///< Setting to use asynchronous shaders/graphics pipeline // Telemetry parameters std::string vendor_name; ///< Device's driver name. @@ -241,6 +272,9 @@ private: /// Format properties dictionary. std::unordered_map<VkFormat, VkFormatProperties> format_properties; + + /// Nsight Aftermath GPU crash tracker + NsightAftermathTracker nsight_aftermath_tracker; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp new file mode 100644 index 000000000..5babbdd0b --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -0,0 +1,101 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
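
The new instance_version member added above stores the Vulkan-packed version number, which is why LoadExtensions can compare it directly against VK_API_VERSION_1_1. A hypothetical caller would decode it with the standard VK_VERSION_* macros:

    // Hypothetical helper; assumes a constructed VKDevice and the usual logging include.
    void LogInstanceVersion(const VKDevice& device) {
        const u32 version = device.InstanceApiVersion();
        LOG_INFO(Render_Vulkan, "Instance Vulkan version {}.{}.{}", VK_VERSION_MAJOR(version),
                 VK_VERSION_MINOR(version), VK_VERSION_PATCH(version));
    }
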
+ +#include <memory> +#include <thread> + +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_fence_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, bool is_stubbed) + : VideoCommon::FenceBase(payload, is_stubbed), device{device}, scheduler{scheduler} {} + +InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, + u32 payload, bool is_stubbed) + : VideoCommon::FenceBase(address, payload, is_stubbed), device{device}, scheduler{scheduler} {} + +InnerFence::~InnerFence() = default; + +void InnerFence::Queue() { + if (is_stubbed) { + return; + } + ASSERT(!event); + + event = device.GetLogical().CreateEvent(); + ticks = scheduler.CurrentTick(); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([event = *event](vk::CommandBuffer cmdbuf) { + cmdbuf.SetEvent(event, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + }); +} + +bool InnerFence::IsSignaled() const { + if (is_stubbed) { + return true; + } + ASSERT(event); + return IsEventSignalled(); +} + +void InnerFence::Wait() { + if (is_stubbed) { + return; + } + ASSERT(event); + + if (ticks >= scheduler.CurrentTick()) { + scheduler.Flush(); + } + while (!IsEventSignalled()) { + std::this_thread::yield(); + } +} + +bool InnerFence::IsEventSignalled() const { + switch (const VkResult result = event.GetStatus()) { + case VK_EVENT_SET: + return true; + case VK_EVENT_RESET: + return false; + default: + throw vk::Exception(result); + } +} + +VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device_, VKScheduler& scheduler_) + : GenericFenceManager(rasterizer, gpu, texture_cache, buffer_cache, query_cache), + device{device_}, scheduler{scheduler_} {} + +Fence VKFenceManager::CreateFence(u32 value, bool is_stubbed) { + return std::make_shared<InnerFence>(device, scheduler, value, is_stubbed); +} + +Fence VKFenceManager::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { + return std::make_shared<InnerFence>(device, scheduler, addr, value, is_stubbed); +} + +void VKFenceManager::QueueFence(Fence& fence) { + fence->Queue(); +} + +bool VKFenceManager::IsFenceSignaled(Fence& fence) const { + return fence->IsSignaled(); +} + +void VKFenceManager::WaitFence(Fence& fence) { + fence->Wait(); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h new file mode 100644 index 000000000..1547d6d30 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -0,0 +1,75 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
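
InnerFence builds the fence out of a plain VkEvent: Queue() records a set-event into the scheduler's command buffer and IsEventSignalled() polls it from the host, with Wait() flushing the scheduler first when the recorded tick has not been submitted yet (an event that was never submitted would spin forever). Assuming the vk::CommandBuffer and vk::Event wrappers forward straight to the core API, the two operations reduce to:

    #include <vulkan/vulkan.h>

    // Recorded into the current command buffer by InnerFence::Queue().
    void RecordFenceSignal(VkCommandBuffer cmdbuf, VkEvent event) {
        vkCmdSetEvent(cmdbuf, event, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
    }

    // Host-side poll performed by InnerFence::IsEventSignalled().
    bool IsEventSet(VkDevice device, VkEvent event) {
        return vkGetEventStatus(device, event) == VK_EVENT_SET;
    }
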
+ +#pragma once + +#include <memory> + +#include "video_core/fence_manager.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Core { +class System; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class VKBufferCache; +class VKDevice; +class VKQueryCache; +class VKScheduler; +class VKTextureCache; + +class InnerFence : public VideoCommon::FenceBase { +public: + explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, + bool is_stubbed); + explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, + u32 payload, bool is_stubbed); + ~InnerFence(); + + void Queue(); + + bool IsSignaled() const; + + void Wait(); + +private: + bool IsEventSignalled() const; + + const VKDevice& device; + VKScheduler& scheduler; + vk::Event event; + u64 ticks = 0; +}; +using Fence = std::shared_ptr<InnerFence>; + +using GenericFenceManager = + VideoCommon::FenceManager<Fence, VKTextureCache, VKBufferCache, VKQueryCache>; + +class VKFenceManager final : public GenericFenceManager { +public: + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device, VKScheduler& scheduler); + +protected: + Fence CreateFence(u32 value, bool is_stubbed) override; + Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override; + void QueueFence(Fence& fence) override; + bool IsFenceSignaled(Fence& fence) const override; + void WaitFence(Fence& fence) override; + +private: + const VKDevice& device; + VKScheduler& scheduler; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index b540b838d..0e8f9c352 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -2,11 +2,11 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
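
The header above only supplies the Vulkan-specific hooks; the policy of when fences are created, queued and waited on lives in the shared VideoCommon::FenceManager template from video_core/fence_manager.h, which is not part of this hunk. A hypothetical wiring sketch using nothing beyond the constructor signature shown:

    // All argument names are placeholders for objects the renderer already owns.
    Vulkan::VKFenceManager fence_manager(rasterizer, gpu, memory_manager, texture_cache,
                                         buffer_cache, query_cache, device, scheduler);

The texture, buffer and query caches appear as template parameters presumably so the shared code can flush them around fence signaling without depending on either backend.
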
+#include <algorithm> #include <array> #include <cstring> #include <vector> -#include "common/assert.h" #include "common/common_types.h" #include "common/microprofile.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -26,16 +26,17 @@ MICROPROFILE_DECLARE(Vulkan_PipelineCache); namespace { -VkStencilOpState GetStencilFaceState(const FixedPipelineState::StencilFace& face) { - VkStencilOpState state; - state.failOp = MaxwellToVK::StencilOp(face.action_stencil_fail); - state.passOp = MaxwellToVK::StencilOp(face.action_depth_pass); - state.depthFailOp = MaxwellToVK::StencilOp(face.action_depth_fail); - state.compareOp = MaxwellToVK::ComparisonOp(face.test_func); - state.compareMask = 0; - state.writeMask = 0; - state.reference = 0; - return state; +template <class StencilFace> +VkStencilOpState GetStencilFaceState(const StencilFace& face) { + return { + .failOp = MaxwellToVK::StencilOp(face.ActionStencilFail()), + .passOp = MaxwellToVK::StencilOp(face.ActionDepthPass()), + .depthFailOp = MaxwellToVK::StencilOp(face.ActionDepthFail()), + .compareOp = MaxwellToVK::ComparisonOp(face.TestFunc()), + .compareMask = 0, + .writeMask = 0, + .reference = 0, + }; } bool SupportsPrimitiveRestart(VkPrimitiveTopology topology) { @@ -50,6 +51,24 @@ bool SupportsPrimitiveRestart(VkPrimitiveTopology topology) { topology) == std::end(unsupported_topologies); } +VkViewportSwizzleNV UnpackViewportSwizzle(u16 swizzle) { + union Swizzle { + u32 raw; + BitField<0, 3, Maxwell::ViewportSwizzle> x; + BitField<4, 3, Maxwell::ViewportSwizzle> y; + BitField<8, 3, Maxwell::ViewportSwizzle> z; + BitField<12, 3, Maxwell::ViewportSwizzle> w; + }; + const Swizzle unpacked{swizzle}; + + return { + .x = MaxwellToVK::ViewportSwizzle(unpacked.x), + .y = MaxwellToVK::ViewportSwizzle(unpacked.y), + .z = MaxwellToVK::ViewportSwizzle(unpacked.z), + .w = MaxwellToVK::ViewportSwizzle(unpacked.w), + }; +} + } // Anonymous namespace VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, @@ -59,15 +78,14 @@ VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& sche const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, const SPIRVProgram& program) - : device{device}, scheduler{scheduler}, fixed_state{key.fixed_state}, hash{key.Hash()}, + : device{device}, scheduler{scheduler}, cache_key{key}, hash{cache_key.Hash()}, descriptor_set_layout{CreateDescriptorSetLayout(bindings)}, descriptor_allocator{descriptor_pool, *descriptor_set_layout}, update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, descriptor_template{CreateDescriptorUpdateTemplate(program)}, modules{CreateShaderModules( program)}, - renderpass{renderpass_cache.GetRenderPass(key.renderpass_params)}, pipeline{CreatePipeline( - key.renderpass_params, - program)} {} + renderpass{renderpass_cache.GetRenderPass(cache_key.renderpass_params)}, + pipeline{CreatePipeline(cache_key.renderpass_params, program)} {} VKGraphicsPipeline::~VKGraphicsPipeline() = default; @@ -75,31 +93,33 @@ VkDescriptorSet VKGraphicsPipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } vk::DescriptorSetLayout VKGraphicsPipeline::CreateDescriptorSetLayout( vk::Span<VkDescriptorSetLayoutBinding> bindings) const { - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = bindings.size(); - ci.pBindings = bindings.data(); + const VkDescriptorSetLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = bindings.size(), + .pBindings = bindings.data(), + }; return device.GetLogical().CreateDescriptorSetLayout(ci); } vk::PipelineLayout VKGraphicsPipeline::CreatePipelineLayout() const { - VkPipelineLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; + const VkPipelineLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; return device.GetLogical().CreatePipelineLayout(ci); } @@ -118,26 +138,29 @@ vk::DescriptorUpdateTemplateKHR VKGraphicsPipeline::CreateDescriptorUpdateTempla return {}; } - VkDescriptorUpdateTemplateCreateInfoKHR ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - ci.pNext = nullptr; - ci.flags = 0; - ci.descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()); - ci.pDescriptorUpdateEntries = template_entries.data(); - ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - ci.descriptorSetLayout = *descriptor_set_layout; - ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - ci.pipelineLayout = *layout; - ci.set = DESCRIPTOR_SET; + const VkDescriptorUpdateTemplateCreateInfoKHR ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()), + .pDescriptorUpdateEntries = template_entries.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = DESCRIPTOR_SET, + }; return device.GetLogical().CreateDescriptorUpdateTemplateKHR(ci); } std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( const SPIRVProgram& program) const { - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; + VkShaderModuleCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = 0, + }; std::vector<vk::ShaderModule> modules; modules.reserve(Maxwell::MaxShaderStage); @@ -147,6 +170,8 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( continue; } + device.SaveShader(stage->code); + ci.codeSize = stage->code.size() * sizeof(u32); ci.pCode = stage->code.data(); modules.push_back(device.GetLogical().CreateShaderModule(ci)); @@ -156,186 +181,251 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpass_params, const SPIRVProgram& program) const { - const auto& vi = fixed_state.vertex_input; - const auto& ia = fixed_state.input_assembly; - const auto& ds = fixed_state.depth_stencil; - const auto& cd = fixed_state.color_blending; - const auto& ts = fixed_state.tessellation; - 
const auto& rs = fixed_state.rasterizer; + const auto& state = cache_key.fixed_state; + const auto& viewport_swizzles = state.viewport_swizzles; + + FixedPipelineState::DynamicState dynamic; + if (device.IsExtExtendedDynamicStateSupported()) { + // Insert dummy values, as long as they are valid they don't matter as extended dynamic + // state is ignored + dynamic.raw1 = 0; + dynamic.raw2 = 0; + for (FixedPipelineState::VertexBinding& binding : dynamic.vertex_bindings) { + // Enable all vertex bindings + binding.raw = 0; + binding.enabled.Assign(1); + } + } else { + dynamic = state.dynamic_state; + } std::vector<VkVertexInputBindingDescription> vertex_bindings; std::vector<VkVertexInputBindingDivisorDescriptionEXT> vertex_binding_divisors; - for (std::size_t i = 0; i < vi.num_bindings; ++i) { - const auto& binding = vi.bindings[i]; - const bool instanced = binding.divisor != 0; + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& binding = dynamic.vertex_bindings[index]; + if (!binding.enabled) { + continue; + } + const bool instanced = state.binding_divisors[index] != 0; const auto rate = instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX; - auto& vertex_binding = vertex_bindings.emplace_back(); - vertex_binding.binding = binding.index; - vertex_binding.stride = binding.stride; - vertex_binding.inputRate = rate; + vertex_bindings.push_back({ + .binding = static_cast<u32>(index), + .stride = binding.stride, + .inputRate = rate, + }); if (instanced) { - auto& binding_divisor = vertex_binding_divisors.emplace_back(); - binding_divisor.binding = binding.index; - binding_divisor.divisor = binding.divisor; + vertex_binding_divisors.push_back({ + .binding = static_cast<u32>(index), + .divisor = state.binding_divisors[index], + }); } } std::vector<VkVertexInputAttributeDescription> vertex_attributes; const auto& input_attributes = program[0]->entries.attributes; - for (std::size_t i = 0; i < vi.num_attributes; ++i) { - const auto& attribute = vi.attributes[i]; - if (input_attributes.find(attribute.index) == input_attributes.end()) { + for (std::size_t index = 0; index < state.attributes.size(); ++index) { + const auto& attribute = state.attributes[index]; + if (!attribute.enabled) { + continue; + } + if (input_attributes.find(static_cast<u32>(index)) == input_attributes.end()) { // Skip attributes not used by the vertex shaders. 
continue; } - auto& vertex_attribute = vertex_attributes.emplace_back(); - vertex_attribute.location = attribute.index; - vertex_attribute.binding = attribute.buffer; - vertex_attribute.format = MaxwellToVK::VertexFormat(attribute.type, attribute.size); - vertex_attribute.offset = attribute.offset; + vertex_attributes.push_back({ + .location = static_cast<u32>(index), + .binding = attribute.buffer, + .format = MaxwellToVK::VertexFormat(attribute.Type(), attribute.Size()), + .offset = attribute.offset, + }); } - VkPipelineVertexInputStateCreateInfo vertex_input_ci; - vertex_input_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertex_input_ci.pNext = nullptr; - vertex_input_ci.flags = 0; - vertex_input_ci.vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()); - vertex_input_ci.pVertexBindingDescriptions = vertex_bindings.data(); - vertex_input_ci.vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()); - vertex_input_ci.pVertexAttributeDescriptions = vertex_attributes.data(); - - VkPipelineVertexInputDivisorStateCreateInfoEXT input_divisor_ci; - input_divisor_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT; - input_divisor_ci.pNext = nullptr; - input_divisor_ci.vertexBindingDivisorCount = static_cast<u32>(vertex_binding_divisors.size()); - input_divisor_ci.pVertexBindingDivisors = vertex_binding_divisors.data(); + VkPipelineVertexInputStateCreateInfo vertex_input_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()), + .pVertexBindingDescriptions = vertex_bindings.data(), + .vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()), + .pVertexAttributeDescriptions = vertex_attributes.data(), + }; + + const VkPipelineVertexInputDivisorStateCreateInfoEXT input_divisor_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT, + .pNext = nullptr, + .vertexBindingDivisorCount = static_cast<u32>(vertex_binding_divisors.size()), + .pVertexBindingDivisors = vertex_binding_divisors.data(), + }; if (!vertex_binding_divisors.empty()) { vertex_input_ci.pNext = &input_divisor_ci; } - VkPipelineInputAssemblyStateCreateInfo input_assembly_ci; - input_assembly_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - input_assembly_ci.pNext = nullptr; - input_assembly_ci.flags = 0; - input_assembly_ci.topology = MaxwellToVK::PrimitiveTopology(device, ia.topology); - input_assembly_ci.primitiveRestartEnable = - ia.primitive_restart_enable && SupportsPrimitiveRestart(input_assembly_ci.topology); - - VkPipelineTessellationStateCreateInfo tessellation_ci; - tessellation_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO; - tessellation_ci.pNext = nullptr; - tessellation_ci.flags = 0; - tessellation_ci.patchControlPoints = ts.patch_control_points; - - VkPipelineViewportStateCreateInfo viewport_ci; - viewport_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewport_ci.pNext = nullptr; - viewport_ci.flags = 0; - viewport_ci.viewportCount = Maxwell::NumViewports; - viewport_ci.pViewports = nullptr; - viewport_ci.scissorCount = Maxwell::NumViewports; - viewport_ci.pScissors = nullptr; - - VkPipelineRasterizationStateCreateInfo rasterization_ci; - rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterization_ci.pNext = nullptr; - rasterization_ci.flags = 
0; - rasterization_ci.depthClampEnable = rs.depth_clamp_enable; - rasterization_ci.rasterizerDiscardEnable = VK_FALSE; - rasterization_ci.polygonMode = VK_POLYGON_MODE_FILL; - rasterization_ci.cullMode = - rs.cull_enable ? MaxwellToVK::CullFace(rs.cull_face) : VK_CULL_MODE_NONE; - rasterization_ci.frontFace = MaxwellToVK::FrontFace(rs.front_face); - rasterization_ci.depthBiasEnable = rs.depth_bias_enable; - rasterization_ci.depthBiasConstantFactor = 0.0f; - rasterization_ci.depthBiasClamp = 0.0f; - rasterization_ci.depthBiasSlopeFactor = 0.0f; - rasterization_ci.lineWidth = 1.0f; - - VkPipelineMultisampleStateCreateInfo multisample_ci; - multisample_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample_ci.pNext = nullptr; - multisample_ci.flags = 0; - multisample_ci.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisample_ci.sampleShadingEnable = VK_FALSE; - multisample_ci.minSampleShading = 0.0f; - multisample_ci.pSampleMask = nullptr; - multisample_ci.alphaToCoverageEnable = VK_FALSE; - multisample_ci.alphaToOneEnable = VK_FALSE; - - VkPipelineDepthStencilStateCreateInfo depth_stencil_ci; - depth_stencil_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depth_stencil_ci.pNext = nullptr; - depth_stencil_ci.flags = 0; - depth_stencil_ci.depthTestEnable = ds.depth_test_enable; - depth_stencil_ci.depthWriteEnable = ds.depth_write_enable; - depth_stencil_ci.depthCompareOp = ds.depth_test_enable - ? MaxwellToVK::ComparisonOp(ds.depth_test_function) - : VK_COMPARE_OP_ALWAYS; - depth_stencil_ci.depthBoundsTestEnable = ds.depth_bounds_enable; - depth_stencil_ci.stencilTestEnable = ds.stencil_enable; - depth_stencil_ci.front = GetStencilFaceState(ds.front_stencil); - depth_stencil_ci.back = GetStencilFaceState(ds.back_stencil); - depth_stencil_ci.minDepthBounds = 0.0f; - depth_stencil_ci.maxDepthBounds = 0.0f; + const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, state.topology); + const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = MaxwellToVK::PrimitiveTopology(device, state.topology), + .primitiveRestartEnable = state.primitive_restart_enable != 0 && + SupportsPrimitiveRestart(input_assembly_topology), + }; + + const VkPipelineTessellationStateCreateInfo tessellation_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .patchControlPoints = state.patch_control_points_minus_one.Value() + 1, + }; + + VkPipelineViewportStateCreateInfo viewport_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = Maxwell::NumViewports, + .pViewports = nullptr, + .scissorCount = Maxwell::NumViewports, + .pScissors = nullptr, + }; + + std::array<VkViewportSwizzleNV, Maxwell::NumViewports> swizzles; + std::transform(viewport_swizzles.begin(), viewport_swizzles.end(), swizzles.begin(), + UnpackViewportSwizzle); + VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV, + .pNext = nullptr, + .flags = 0, + .viewportCount = Maxwell::NumViewports, + .pViewportSwizzles = swizzles.data(), + }; + if (device.IsNvViewportSwizzleSupported()) { + viewport_ci.pNext = &swizzle_ci; + } + + const VkPipelineRasterizationStateCreateInfo rasterization_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + 
.pNext = nullptr, + .flags = 0, + .depthClampEnable = + static_cast<VkBool32>(state.depth_clamp_disabled == 0 ? VK_TRUE : VK_FALSE), + .rasterizerDiscardEnable = + static_cast<VkBool32>(state.rasterize_enable == 0 ? VK_TRUE : VK_FALSE), + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = + dynamic.cull_enable ? MaxwellToVK::CullFace(dynamic.CullFace()) : VK_CULL_MODE_NONE, + .frontFace = MaxwellToVK::FrontFace(dynamic.FrontFace()), + .depthBiasEnable = state.depth_bias_enable, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, + }; + + const VkPipelineMultisampleStateCreateInfo multisample_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, + }; + + const VkPipelineDepthStencilStateCreateInfo depth_stencil_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthTestEnable = dynamic.depth_test_enable, + .depthWriteEnable = dynamic.depth_write_enable, + .depthCompareOp = dynamic.depth_test_enable + ? MaxwellToVK::ComparisonOp(dynamic.DepthTestFunc()) + : VK_COMPARE_OP_ALWAYS, + .depthBoundsTestEnable = dynamic.depth_bounds_enable, + .stencilTestEnable = dynamic.stencil_enable, + .front = GetStencilFaceState(dynamic.front), + .back = GetStencilFaceState(dynamic.back), + .minDepthBounds = 0.0f, + .maxDepthBounds = 0.0f, + }; std::array<VkPipelineColorBlendAttachmentState, Maxwell::NumRenderTargets> cb_attachments; - const std::size_t num_attachments = - std::min(cd.attachments_count, renderpass_params.color_attachments.size()); - for (std::size_t i = 0; i < num_attachments; ++i) { - static constexpr std::array component_table = { - VK_COLOR_COMPONENT_R_BIT, VK_COLOR_COMPONENT_G_BIT, VK_COLOR_COMPONENT_B_BIT, - VK_COLOR_COMPONENT_A_BIT}; - const auto& blend = cd.attachments[i]; + const auto num_attachments = static_cast<std::size_t>(renderpass_params.num_color_attachments); + for (std::size_t index = 0; index < num_attachments; ++index) { + static constexpr std::array COMPONENT_TABLE{ + VK_COLOR_COMPONENT_R_BIT, + VK_COLOR_COMPONENT_G_BIT, + VK_COLOR_COMPONENT_B_BIT, + VK_COLOR_COMPONENT_A_BIT, + }; + const auto& blend = state.attachments[index]; VkColorComponentFlags color_components = 0; - for (std::size_t j = 0; j < component_table.size(); ++j) { - if (blend.components[j]) { - color_components |= component_table[j]; + for (std::size_t i = 0; i < COMPONENT_TABLE.size(); ++i) { + if (blend.Mask()[i]) { + color_components |= COMPONENT_TABLE[i]; } } - VkPipelineColorBlendAttachmentState& attachment = cb_attachments[i]; - attachment.blendEnable = blend.enable; - attachment.srcColorBlendFactor = MaxwellToVK::BlendFactor(blend.src_rgb_func); - attachment.dstColorBlendFactor = MaxwellToVK::BlendFactor(blend.dst_rgb_func); - attachment.colorBlendOp = MaxwellToVK::BlendEquation(blend.rgb_equation); - attachment.srcAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.src_a_func); - attachment.dstAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.dst_a_func); - attachment.alphaBlendOp = MaxwellToVK::BlendEquation(blend.a_equation); - attachment.colorWriteMask = color_components; + cb_attachments[index] = { + .blendEnable = blend.enable != 0, + .srcColorBlendFactor = MaxwellToVK::BlendFactor(blend.SourceRGBFactor()), + 
.dstColorBlendFactor = MaxwellToVK::BlendFactor(blend.DestRGBFactor()), + .colorBlendOp = MaxwellToVK::BlendEquation(blend.EquationRGB()), + .srcAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.SourceAlphaFactor()), + .dstAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.DestAlphaFactor()), + .alphaBlendOp = MaxwellToVK::BlendEquation(blend.EquationAlpha()), + .colorWriteMask = color_components, + }; } - VkPipelineColorBlendStateCreateInfo color_blend_ci; - color_blend_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_ci.pNext = nullptr; - color_blend_ci.flags = 0; - color_blend_ci.logicOpEnable = VK_FALSE; - color_blend_ci.logicOp = VK_LOGIC_OP_COPY; - color_blend_ci.attachmentCount = static_cast<u32>(num_attachments); - color_blend_ci.pAttachments = cb_attachments.data(); - std::memset(color_blend_ci.blendConstants, 0, sizeof(color_blend_ci.blendConstants)); - - static constexpr std::array dynamic_states = { + const VkPipelineColorBlendStateCreateInfo color_blend_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = static_cast<u32>(num_attachments), + .pAttachments = cb_attachments.data(), + .blendConstants = {}, + }; + + std::vector dynamic_states{ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_DEPTH_BIAS, VK_DYNAMIC_STATE_BLEND_CONSTANTS, VK_DYNAMIC_STATE_DEPTH_BOUNDS, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE}; + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }; + if (device.IsExtExtendedDynamicStateSupported()) { + static constexpr std::array extended{ + VK_DYNAMIC_STATE_CULL_MODE_EXT, + VK_DYNAMIC_STATE_FRONT_FACE_EXT, + VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT, + VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT, + VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT, + VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_STENCIL_OP_EXT, + }; + dynamic_states.insert(dynamic_states.end(), extended.begin(), extended.end()); + } - VkPipelineDynamicStateCreateInfo dynamic_state_ci; - dynamic_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamic_state_ci.pNext = nullptr; - dynamic_state_ci.flags = 0; - dynamic_state_ci.dynamicStateCount = static_cast<u32>(dynamic_states.size()); - dynamic_state_ci.pDynamicStates = dynamic_states.data(); + const VkPipelineDynamicStateCreateInfo dynamic_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast<u32>(dynamic_states.size()), + .pDynamicStates = dynamic_states.data(), + }; - VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci; - subgroup_size_ci.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT; - subgroup_size_ci.pNext = nullptr; - subgroup_size_ci.requiredSubgroupSize = GuestWarpSize; + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = nullptr, + .requiredSubgroupSize = GuestWarpSize, + }; std::vector<VkPipelineShaderStageCreateInfo> shader_stages; std::size_t module_index = 0; @@ -343,6 +433,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa 
if (!program[stage]) { continue; } + VkPipelineShaderStageCreateInfo& stage_ci = shader_stages.emplace_back(); stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stage_ci.pNext = nullptr; @@ -357,26 +448,27 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa } } - VkGraphicsPipelineCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.stageCount = static_cast<u32>(shader_stages.size()); - ci.pStages = shader_stages.data(); - ci.pVertexInputState = &vertex_input_ci; - ci.pInputAssemblyState = &input_assembly_ci; - ci.pTessellationState = &tessellation_ci; - ci.pViewportState = &viewport_ci; - ci.pRasterizationState = &rasterization_ci; - ci.pMultisampleState = &multisample_ci; - ci.pDepthStencilState = &depth_stencil_ci; - ci.pColorBlendState = &color_blend_ci; - ci.pDynamicState = &dynamic_state_ci; - ci.layout = *layout; - ci.renderPass = renderpass; - ci.subpass = 0; - ci.basePipelineHandle = nullptr; - ci.basePipelineIndex = 0; + const VkGraphicsPipelineCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(shader_stages.size()), + .pStages = shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = &tessellation_ci, + .pViewportState = &viewport_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisample_ci, + .pDepthStencilState = &depth_stencil_ci, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *layout, + .renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }; return device.GetLogical().CreateGraphicsPipeline(ci); } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 7aba70960..58aa35efd 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -5,16 +5,13 @@ #pragma once #include <array> -#include <memory> #include <optional> -#include <unordered_map> #include <vector> #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -22,7 +19,27 @@ namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -struct GraphicsPipelineCacheKey; +struct GraphicsPipelineCacheKey { + RenderPassParams renderpass_params; + u32 padding; + std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; + FixedPipelineState fixed_state; + + std::size_t Hash() const noexcept; + + bool operator==(const GraphicsPipelineCacheKey& rhs) const noexcept; + + bool operator!=(const GraphicsPipelineCacheKey& rhs) const noexcept { + return !operator==(rhs); + } + + std::size_t Size() const noexcept { + return sizeof(renderpass_params) + sizeof(padding) + sizeof(shaders) + fixed_state.Size(); + } +}; +static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>); +static_assert(std::is_trivially_copyable_v<GraphicsPipelineCacheKey>); +static_assert(std::is_trivially_constructible_v<GraphicsPipelineCacheKey>); class 
VKDescriptorPool; class VKDevice; @@ -57,6 +74,10 @@ public: return renderpass; } + GraphicsPipelineCacheKey GetCacheKey() const { + return cache_key; + } + private: vk::DescriptorSetLayout CreateDescriptorSetLayout( vk::Span<VkDescriptorSetLayoutBinding> bindings) const; @@ -73,7 +94,7 @@ private: const VKDevice& device; VKScheduler& scheduler; - const FixedPipelineState fixed_state; + const GraphicsPipelineCacheKey cache_key; const u64 hash; vk::DescriptorSetLayout descriptor_set_layout; diff --git a/src/video_core/renderer_vulkan/vk_image.cpp b/src/video_core/renderer_vulkan/vk_image.cpp index 9bceb3861..1c418ea17 100644 --- a/src/video_core/renderer_vulkan/vk_image.cpp +++ b/src/video_core/renderer_vulkan/vk_image.cpp @@ -102,21 +102,29 @@ bool VKImage::HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num void VKImage::CreatePresentView() { // Image type has to be 2D to be presented. - VkImageViewCreateInfo image_view_ci; - image_view_ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - image_view_ci.pNext = nullptr; - image_view_ci.flags = 0; - image_view_ci.image = *image; - image_view_ci.viewType = VK_IMAGE_VIEW_TYPE_2D; - image_view_ci.format = format; - image_view_ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, - VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; - image_view_ci.subresourceRange.aspectMask = aspect_mask; - image_view_ci.subresourceRange.baseMipLevel = 0; - image_view_ci.subresourceRange.levelCount = 1; - image_view_ci.subresourceRange.baseArrayLayer = 0; - image_view_ci.subresourceRange.layerCount = 1; - present_view = device.GetLogical().CreateImageView(image_view_ci); + present_view = device.GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); } VKImage::SubrangeState& VKImage::GetSubrangeState(u32 layer, u32 level) noexcept { diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp new file mode 100644 index 000000000..ae26e558d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -0,0 +1,56 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
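
GraphicsPipelineCacheKey, added to vk_graphics_pipeline.h above, declares Hash() and operator== and then static_asserts unique object representations and trivial copyability; those asserts are what make it legal to hash and compare the key as raw bytes. A sketch of a Hash() consistent with them, assuming a generic byte hasher (the Common::CityHash64 name is an assumption):

    // Sketch only: byte-wise hash over the meaningful prefix reported by Size().
    std::size_t GraphicsPipelineCacheKey::Hash() const noexcept {
        const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size());
        return static_cast<std::size_t>(hash);
    }

Equality can likewise be a std::memcmp over the same Size() bytes, which is presumably why the u32 padding member is spelled out explicitly instead of being left to the compiler.
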
+ +#include <atomic> +#include <chrono> + +#include "core/settings.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +using namespace std::chrono_literals; + +MasterSemaphore::MasterSemaphore(const VKDevice& device) { + static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, + .pNext = nullptr, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR, + .initialValue = 0, + }; + static constexpr VkSemaphoreCreateInfo semaphore_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &semaphore_type_ci, + .flags = 0, + }; + semaphore = device.GetLogical().CreateSemaphore(semaphore_ci); + + if (!Settings::values.renderer_debug) { + return; + } + // Validation layers have a bug where they fail to track resource usage when using timeline + // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have + // a separate thread waiting for each timeline semaphore value. + debug_thread = std::thread([this] { + u64 counter = 0; + while (!shutdown) { + if (semaphore.Wait(counter, 10'000'000)) { + ++counter; + } + } + }); +} + +MasterSemaphore::~MasterSemaphore() { + shutdown = true; + + // This thread might not be started + if (debug_thread.joinable()) { + debug_thread.join(); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h new file mode 100644 index 000000000..0e93706d7 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <atomic> +#include <thread> + +#include "common/common_types.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class VKDevice; + +class MasterSemaphore { +public: + explicit MasterSemaphore(const VKDevice& device); + ~MasterSemaphore(); + + /// Returns the current logical tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return current_tick; + } + + /// Returns the timeline semaphore handle. + [[nodiscard]] VkSemaphore Handle() const noexcept { + return *semaphore; + } + + /// Returns true when a tick has been hit by the GPU. + [[nodiscard]] bool IsFree(u64 tick) { + return gpu_tick >= tick; + } + + /// Advance to the logical tick. + void NextTick() noexcept { + ++current_tick; + } + + /// Refresh the known GPU tick + void Refresh() { + gpu_tick = semaphore.GetCounter(); + } + + /// Waits for a tick to be hit on the GPU + void Wait(u64 tick) { + // No need to wait if the GPU is ahead of the tick + if (IsFree(tick)) { + return; + } + // Update the GPU tick and try again + Refresh(); + if (IsFree(tick)) { + return; + } + // If none of the above is hit, fallback to a regular wait + semaphore.Wait(tick); + } + +private: + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. + std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. 
+}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.cpp b/src/video_core/renderer_vulkan/vk_memory_manager.cpp index 6a9e658bf..24c8960ac 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_memory_manager.cpp @@ -118,8 +118,7 @@ private: }; VKMemoryManager::VKMemoryManager(const VKDevice& device) - : device{device}, properties{device.GetPhysical().GetMemoryProperties()}, - is_memory_unified{GetMemoryUnified(properties)} {} + : device{device}, properties{device.GetPhysical().GetMemoryProperties()} {} VKMemoryManager::~VKMemoryManager() = default; @@ -179,13 +178,12 @@ bool VKMemoryManager::AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 t }(); // Try to allocate found type. - VkMemoryAllocateInfo memory_ai; - memory_ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memory_ai.pNext = nullptr; - memory_ai.allocationSize = size; - memory_ai.memoryTypeIndex = type; - - vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory(memory_ai); + vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = size, + .memoryTypeIndex = type, + }); if (!memory) { LOG_CRITICAL(Render_Vulkan, "Device allocation failed!"); return false; @@ -209,16 +207,6 @@ VKMemoryCommit VKMemoryManager::TryAllocCommit(const VkMemoryRequirements& requi return {}; } -bool VKMemoryManager::GetMemoryUnified(const VkPhysicalDeviceMemoryProperties& properties) { - for (u32 heap_index = 0; heap_index < properties.memoryHeapCount; ++heap_index) { - if (!(properties.memoryHeaps[heap_index].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)) { - // Memory is considered unified when heaps are device local only. - return false; - } - } - return true; -} - VKMemoryCommitImpl::VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, const vk::DeviceMemory& memory, u64 begin, u64 end) : device{device}, memory{memory}, interval{begin, end}, allocation{allocation} {} diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.h b/src/video_core/renderer_vulkan/vk_memory_manager.h index 35ee54d30..1af88e3d4 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.h +++ b/src/video_core/renderer_vulkan/vk_memory_manager.h @@ -32,7 +32,7 @@ public: * memory. When passing false, it will try to allocate device local memory. * @returns A memory commit. */ - VKMemoryCommit Commit(const VkMemoryRequirements& reqs, bool host_visible); + VKMemoryCommit Commit(const VkMemoryRequirements& requirements, bool host_visible); /// Commits memory required by the buffer and binds it. VKMemoryCommit Commit(const vk::Buffer& buffer, bool host_visible); @@ -40,11 +40,6 @@ public: /// Commits memory required by the image and binds it. VKMemoryCommit Commit(const vk::Image& image, bool host_visible); - /// Returns true if the memory allocations are done always in host visible and coherent memory. - bool IsMemoryUnified() const { - return is_memory_unified; - } - private: /// Allocates a chunk of memory. bool AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 type_mask, u64 size); @@ -53,12 +48,8 @@ private: VKMemoryCommit TryAllocCommit(const VkMemoryRequirements& requirements, VkMemoryPropertyFlags wanted_properties); - /// Returns true if the device uses an unified memory model. - static bool GetMemoryUnified(const VkPhysicalDeviceMemoryProperties& properties); - - const VKDevice& device; ///< Device handler. 
- const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. - const bool is_memory_unified; ///< True if memory model is unified. + const VKDevice& device; ///< Device handler. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector<std::unique_ptr<VKMemoryAllocation>> allocations; ///< Current allocations. }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 90e3a8edd..dedc9c466 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -22,17 +22,24 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace Vulkan { MICROPROFILE_DECLARE(Vulkan_PipelineCache); using Tegra::Engines::ShaderType; +using VideoCommon::Shader::GetShaderAddress; +using VideoCommon::Shader::GetShaderCode; +using VideoCommon::Shader::KERNEL_MAIN_OFFSET; +using VideoCommon::Shader::ProgramCode; +using VideoCommon::Shader::STAGE_MAIN_OFFSET; namespace { @@ -40,65 +47,12 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; +constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ VideoCommon::Shader::CompileDepth::FullDecompile}; -/// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; -} - -/// Gets if the current instruction offset is a scheduler instruction -constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { - // Sched instructions appear once every 4 instructions. - constexpr std::size_t SchedPeriod = 4; - const std::size_t absolute_offset = offset - main_offset; - return (absolute_offset % SchedPeriod) == 0; -} - -/// Calculates the size of a program stream -std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute) { - const std::size_t start_offset = is_compute ? 0 : 10; - // This is the encoded version of BRA that jumps to itself. All Nvidia - // shaders end with one. 
- constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; - constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; - std::size_t offset = start_offset; - while (offset < program.size()) { - const u64 instruction = program[offset]; - if (!IsSchedInstruction(offset, start_offset)) { - if ((instruction & mask) == self_jumping_branch) { - // End on Maxwell's "nop" instruction - break; - } - if (instruction == 0) { - break; - } - } - ++offset; - } - // The last instruction is included in the program size - return std::min(offset + 1, program.size()); -} - -/// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, - const u8* host_ptr, bool is_compute) { - ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); - ASSERT_OR_EXECUTE(host_ptr != nullptr, { - std::fill(program_code.begin(), program_code.end(), 0); - return program_code; - }); - memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(), - program_code.size() * sizeof(u64)); - program_code.resize(CalculateProgramSize(program_code, is_compute)); - return program_code; -} - constexpr std::size_t GetStageFromProgram(std::size_t program) { return program == 0 ? 0 : program - 1; } @@ -133,14 +87,15 @@ void AddBindings(std::vector<VkDescriptorSetLayoutBinding>& bindings, u32& bindi u32 count = 1; if constexpr (descriptor_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { // Combined image samplers can be arrayed. - count = container[i].Size(); + count = container[i].size; } - VkDescriptorSetLayoutBinding& entry = bindings.emplace_back(); - entry.binding = binding++; - entry.descriptorType = descriptor_type; - entry.descriptorCount = count; - entry.stageFlags = stage_flags; - entry.pImmutableSamplers = nullptr; + bindings.push_back({ + .binding = binding++, + .descriptorType = descriptor_type, + .descriptorCount = count, + .stageFlags = stage_flags, + .pImmutableSamplers = nullptr, + }); } } @@ -153,96 +108,133 @@ u32 FillDescriptorLayout(const ShaderEntries& entries, u32 binding = base_binding; AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers); AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers); - AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers); + AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels); AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers); + AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels); AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images); return binding; } } // Anonymous namespace -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, - u32 main_offset) - : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, - registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, - compiler_settings, registry}, - entries{GenerateShaderEntries(shader_ir)} {} - -CachedShader::~CachedShader() = default; - -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( - Core::System& system, Tegra::Engines::ShaderType stage) { - if (stage == Tegra::Engines::ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } +std::size_t GraphicsPipelineCacheKey::Hash() const noexcept { + const u64 hash = 
Common::CityHash64(reinterpret_cast<const char*>(this), Size()); + return static_cast<std::size_t>(hash); +} + +bool GraphicsPipelineCacheKey::operator==(const GraphicsPipelineCacheKey& rhs) const noexcept { + return std::memcmp(&rhs, this, Size()) == 0; +} + +std::size_t ComputePipelineCacheKey::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); + return static_cast<std::size_t>(hash); +} + +bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) const noexcept { + return std::memcmp(&rhs, this, sizeof *this) == 0; } -VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache) - : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, - descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache{renderpass_cache} {} +Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine, Tegra::Engines::ShaderType stage, + GPUVAddr gpu_addr_, VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code_, + u32 main_offset) + : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage, engine), + shader_ir(program_code, main_offset, compiler_settings, registry), + entries(GenerateShaderEntries(shader_ir)) {} + +Shader::~Shader() = default; + +VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const VKDevice& device_, + VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKRenderPassCache& renderpass_cache_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, gpu{gpu_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, device{device_}, + scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, + update_descriptor_queue{update_descriptor_queue_}, renderpass_cache{renderpass_cache_} {} VKPipelineCache::~VKPipelineCache() = default; -std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { - const auto& gpu = system.GPU().Maxwell3D(); +std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { + std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; - std::array<Shader, Maxwell::MaxShaderProgram> shaders; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr program_addr{GetShaderAddress(system, program)}; - const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const GPUVAddr gpu_addr{GetShaderAddress(maxwell3d, program)}; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : nullptr; - if (!shader) { - const auto host_ptr{memory_manager.GetPointer(program_addr)}; - // No shader found - create a new one - constexpr u32 stage_offset = 10; - const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 
0 : index - 1); - auto code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); + if (!result) { + const u8* const host_ptr{gpu_memory.GetPointer(gpu_addr)}; - shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr, - std::move(code), stage_offset); - Register(shader); + // No shader found - create a new one + static constexpr u32 stage_offset = STAGE_MAIN_OFFSET; + const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, false); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader = std::make_unique<Shader>(maxwell3d, stage, gpu_addr, *cpu_addr, + std::move(code), stage_offset); + result = shader.get(); + + if (cpu_addr) { + Register(std::move(shader), *cpu_addr, size_in_bytes); + } else { + null_shader = std::move(shader); + } } - shaders[index] = std::move(shader); + shaders[index] = result; } return last_shaders = shaders; } -VKGraphicsPipeline& VKPipelineCache::GetGraphicsPipeline(const GraphicsPipelineCacheKey& key) { +VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( + const GraphicsPipelineCacheKey& key, VideoCommon::Shader::AsyncShaders& async_shaders) { MICROPROFILE_SCOPE(Vulkan_PipelineCache); if (last_graphics_pipeline && last_graphics_key == key) { - return *last_graphics_pipeline; + return last_graphics_pipeline; } last_graphics_key = key; + if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(gpu)) { + std::unique_lock lock{pipeline_cache}; + const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); + if (is_cache_miss) { + gpu.ShaderNotify().MarkSharderBuilding(); + LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); + const auto [program, bindings] = DecompileShaders(key.fixed_state); + async_shaders.QueueVulkanShader(this, device, scheduler, descriptor_pool, + update_descriptor_queue, renderpass_cache, bindings, + program, key); + } + last_graphics_pipeline = pair->second.get(); + return last_graphics_pipeline; + } + const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); auto& entry = pair->second; if (is_cache_miss) { + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - const auto [program, bindings] = DecompileShaders(key); + const auto [program, bindings] = DecompileShaders(key.fixed_state); entry = std::make_unique<VKGraphicsPipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, renderpass_cache, key, bindings, program); + gpu.ShaderNotify().MarkShaderComplete(); } - return *(last_graphics_pipeline = entry.get()); + last_graphics_pipeline = entry.get(); + return last_graphics_pipeline; } VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCacheKey& key) { @@ -255,29 +247,39 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - auto& memory_manager = system.GPU().MemoryManager(); - const auto program_addr = key.shader; + const GPUVAddr gpu_addr = key.shader; - const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : nullptr; + Shader* shader = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one - const auto host_ptr = memory_manager.GetPointer(program_addr); - - auto code = GetShaderCode(memory_manager, program_addr, host_ptr, true); - constexpr u32 kernel_main_offset = 0; - shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute, - program_addr, *cpu_addr, std::move(code), - kernel_main_offset); - Register(shader); - } + const auto host_ptr = gpu_memory.GetPointer(gpu_addr); - Specialization specialization; - specialization.workgroup_size = key.workgroup_size; - specialization.shared_memory_size = key.shared_memory_size; + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, true); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader_info = std::make_unique<Shader>(kepler_compute, ShaderType::Compute, gpu_addr, + *cpu_addr, std::move(code), KERNEL_MAIN_OFFSET); + shader = shader_info.get(); + if (cpu_addr) { + Register(std::move(shader_info), *cpu_addr, size_in_bytes); + } else { + null_kernel = std::move(shader_info); + } + } + + const Specialization specialization{ + .base_binding = 0, + .workgroup_size = key.workgroup_size, + .shared_memory_size = key.shared_memory_size, + .point_size = std::nullopt, + .enabled_attributes = {}, + .attribute_types = {}, + .ndc_minus_one_to_one = false, + }; const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute, shader->GetRegistry(), specialization), shader->GetEntries()}; @@ -286,7 +288,13 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach return *entry; } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline) { + gpu.ShaderNotify().MarkShaderComplete(); + std::unique_lock lock{pipeline_cache}; + graphics_cache.at(pipeline->GetCacheKey()) = std::move(pipeline); +} + +void VKPipelineCache::OnShaderRemoval(Shader* shader) { bool finished = false; const auto Finish = [&] { // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -318,25 +326,23 @@ void VKPipelineCache::Unregister(const Shader& shader) { Finish(); it = compute_cache.erase(it); } - - RasterizerCache::Unregister(shader); } std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> -VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { - const auto& fixed_state = key.fixed_state; - auto& memory_manager = system.GPU().MemoryManager(); - const auto& gpu = system.GPU().Maxwell3D(); - +VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { Specialization specialization; - if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) { - ASSERT(fixed_state.input_assembly.point_size != 0.0f); - specialization.point_size = fixed_state.input_assembly.point_size; + if (fixed_state.topology == Maxwell::PrimitiveTopology::Points) { + float point_size; + std::memcpy(&point_size, &fixed_state.point_size, sizeof(float)); + specialization.point_size = point_size; + ASSERT(point_size != 0.0f); } for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) { - specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; + const auto& attribute = fixed_state.attributes[i]; + specialization.enabled_attributes[i] = attribute.enabled.Value() != 0; + specialization.attribute_types[i] = attribute.Type(); } - specialization.ndc_minus_one_to_one = 
fixed_state.rasterizer.ndc_minus_one_to_one; + specialization.ndc_minus_one_to_one = fixed_state.ndc_minus_one_to_one; SPIRVProgram program; std::vector<VkDescriptorSetLayoutBinding> bindings; @@ -345,18 +351,16 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const auto program_enum = static_cast<Maxwell::ShaderProgram>(index); // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr); - const auto shader = TryGet(*cpu_addr); - ASSERT(shader); + const GPUVAddr gpu_addr = GetShaderAddress(maxwell3d, program_enum); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 - const auto program_type = GetShaderType(program_enum); + const ShaderType program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), @@ -383,14 +387,15 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3 if constexpr (descriptor_type == COMBINED_IMAGE_SAMPLER) { for (u32 i = 0; i < count; ++i) { - const u32 num_samplers = container[i].Size(); - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding; - entry.dstArrayElement = 0; - entry.descriptorCount = num_samplers; - entry.descriptorType = descriptor_type; - entry.offset = offset; - entry.stride = entry_size; + const u32 num_samplers = container[i].size; + template_entries.push_back({ + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = num_samplers, + .descriptorType = descriptor_type, + .offset = offset, + .stride = entry_size, + }); ++binding; offset += num_samplers * entry_size; @@ -398,26 +403,29 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3 return; } - if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. + if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER || + descriptor_type == STORAGE_TEXEL_BUFFER) { + // Nvidia has a bug where updating multiple texels at once causes the driver to crash. 
+ // Note: Fixed in driver Windows 443.24, Linux 440.66.15 for (u32 i = 0; i < count; ++i) { - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding + i; - entry.dstArrayElement = 0; - entry.descriptorCount = 1; - entry.descriptorType = descriptor_type; - entry.offset = offset + i * entry_size; - entry.stride = entry_size; + template_entries.push_back({ + .dstBinding = binding + i, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = descriptor_type, + .offset = static_cast<std::size_t>(offset + i * entry_size), + .stride = entry_size, + }); } } else if (count > 0) { - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding; - entry.dstArrayElement = 0; - entry.descriptorCount = count; - entry.descriptorType = descriptor_type; - entry.offset = offset; - entry.stride = entry_size; + template_entries.push_back({ + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = count, + .descriptorType = descriptor_type, + .offset = offset, + .stride = entry_size, + }); } offset += count * entry_size; binding += count; @@ -428,8 +436,9 @@ void FillDescriptorUpdateTemplateEntries( std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) { AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers); AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers); - AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers); + AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels); AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers); + AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels); AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images); } diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 7ccdb7083..e558e6658 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -7,7 +7,6 @@ #include <array> #include <cstddef> #include <memory> -#include <tuple> #include <type_traits> #include <unordered_map> #include <utility> @@ -18,16 +17,16 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader/async_shaders.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" -#include "video_core/surface.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -39,54 +38,27 @@ class RasterizerVulkan; class VKComputePipeline; class VKDescriptorPool; class VKDevice; -class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ProgramCode = std::vector<u64>; +struct ComputePipelineCacheKey { + GPUVAddr shader; + u32 shared_memory_size; + 
std::array<u32, 3> workgroup_size; -struct GraphicsPipelineCacheKey { - FixedPipelineState fixed_state; - std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; - RenderPassParams renderpass_params; + std::size_t Hash() const noexcept; - std::size_t Hash() const noexcept { - std::size_t hash = fixed_state.Hash(); - for (const auto& shader : shaders) { - boost::hash_combine(hash, shader); - } - boost::hash_combine(hash, renderpass_params.Hash()); - return hash; - } + bool operator==(const ComputePipelineCacheKey& rhs) const noexcept; - bool operator==(const GraphicsPipelineCacheKey& rhs) const noexcept { - return std::tie(fixed_state, shaders, renderpass_params) == - std::tie(rhs.fixed_state, rhs.shaders, rhs.renderpass_params); - } -}; - -struct ComputePipelineCacheKey { - GPUVAddr shader{}; - u32 shared_memory_size{}; - std::array<u32, 3> workgroup_size{}; - - std::size_t Hash() const noexcept { - return static_cast<std::size_t>(shader) ^ - ((static_cast<std::size_t>(shared_memory_size) >> 7) << 40) ^ - static_cast<std::size_t>(workgroup_size[0]) ^ - (static_cast<std::size_t>(workgroup_size[1]) << 16) ^ - (static_cast<std::size_t>(workgroup_size[2]) << 24); - } - - bool operator==(const ComputePipelineCacheKey& rhs) const noexcept { - return std::tie(shader, shared_memory_size, workgroup_size) == - std::tie(rhs.shader, rhs.shared_memory_size, rhs.workgroup_size); + bool operator!=(const ComputePipelineCacheKey& rhs) const noexcept { + return !operator==(rhs); } }; +static_assert(std::has_unique_object_representations_v<ComputePipelineCacheKey>); +static_assert(std::is_trivially_copyable_v<ComputePipelineCacheKey>); +static_assert(std::is_trivially_constructible_v<ComputePipelineCacheKey>); } // namespace Vulkan @@ -110,21 +82,22 @@ struct hash<Vulkan::ComputePipelineCacheKey> { namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader { public: - explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VAddr cpu_addr, ProgramCode program_code, u32 main_offset); - ~CachedShader(); + explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine, + Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, VAddr cpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + ~Shader(); GPUVAddr GetGpuAddr() const { return gpu_addr; } - std::size_t GetSizeInBytes() const override { - return program_code.size() * sizeof(u64); + VideoCommon::Shader::ShaderIR& GetIR() { + return shader_ir; } - VideoCommon::Shader::ShaderIR& GetIR() { + const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } @@ -132,61 +105,65 @@ public: return registry; } - const VideoCommon::Shader::ShaderIR& GetIR() const { - return shader_ir; - } - const ShaderEntries& GetEntries() const { return entries; } private: - static Tegra::Engines::ConstBufferEngineInterface& GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage); - GPUVAddr gpu_addr{}; - ProgramCode program_code; + VideoCommon::Shader::ProgramCode program_code; VideoCommon::Shader::Registry registry; VideoCommon::Shader::ShaderIR shader_ir; ShaderEntries entries; }; -class VKPipelineCache final : public RasterizerCache<Shader> { +class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: - explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, + explicit VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu, + 
Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const VKDevice& device, + VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache); - ~VKPipelineCache(); + ~VKPipelineCache() override; - std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); + std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); - VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key); + VKGraphicsPipeline* GetGraphicsPipeline(const GraphicsPipelineCacheKey& key, + VideoCommon::Shader::AsyncShaders& async_shaders); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); -protected: - void Unregister(const Shader& shader) override; + void EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline); - void FlushObjectInner(const Shader& object) override {} +protected: + void OnShaderRemoval(Shader* shader) final; private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( - const GraphicsPipelineCacheKey& key); + const FixedPipelineState& fixed_state); + + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; - Core::System& system; const VKDevice& device; VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; VKRenderPassCache& renderpass_cache; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; + + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; GraphicsPipelineCacheKey last_graphics_key; VKGraphicsPipeline* last_graphics_pipeline = nullptr; + std::mutex pipeline_cache; std::unordered_map<GraphicsPipelineCacheKey, std::unique_ptr<VKGraphicsPipeline>> graphics_cache; std::unordered_map<ComputePipelineCacheKey, std::unique_ptr<VKComputePipeline>> compute_cache; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 0966c7ff7..ee2d871e3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -4,41 +4,38 @@ #include <algorithm> #include <cstddef> -#include <cstdint> #include <utility> #include <vector> #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +using VideoCore::QueryType; + namespace { constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; -constexpr VkQueryType GetTarget(VideoCore::QueryType type) { +constexpr VkQueryType GetTarget(QueryType type) { return QUERY_TARGETS[static_cast<std::size_t>(type)]; } } // Anonymous namespace -QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} +QueryPool::QueryPool(const VKDevice& device_, VKScheduler& scheduler, QueryType type_) + : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} QueryPool::~QueryPool() = default; -void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { - device = &device_; - type = type_; -} - -std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { 
+std::pair<VkQueryPool, u32> QueryPool::Commit() { std::size_t index; do { - index = CommitResource(fence); + index = CommitResource(); } while (usage[index]); usage[index] = true; @@ -48,14 +45,14 @@ std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { void QueryPool::Allocate(std::size_t begin, std::size_t end) { usage.resize(end); - VkQueryPoolCreateInfo query_pool_ci; - query_pool_ci.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; - query_pool_ci.pNext = nullptr; - query_pool_ci.flags = 0; - query_pool_ci.queryType = GetTarget(type); - query_pool_ci.queryCount = static_cast<u32>(end - begin); - query_pool_ci.pipelineStatistics = 0; - pools.push_back(device->GetLogical().CreateQueryPool(query_pool_ci)); + pools.push_back(device.GetLogical().CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = GetTarget(type), + .queryCount = static_cast<u32>(end - begin), + .pipelineStatistics = 0, + })); } void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { @@ -69,30 +66,39 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; } -VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, +VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler) - : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool>{system, rasterizer}, - device{device}, scheduler{scheduler} { - for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { - query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, + HostCounter>{rasterizer, maxwell3d, gpu_memory}, + device{device}, scheduler{scheduler}, query_pools{ + QueryPool{device, scheduler, + QueryType::SamplesPassed}, + } {} + +VKQueryCache::~VKQueryCache() { + // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class + // destructor is called. The query cache should be redesigned to have a proper ownership model + // instead of using shared pointers. 
+ for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { + auto& stream = Stream(static_cast<QueryType>(query_type)); + stream.Update(false); + stream.Reset(); } } -VKQueryCache::~VKQueryCache() = default; - -std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(); } -void VKQueryCache::Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query) { +void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { query_pools[static_cast<std::size_t>(type)].Reserve(query); } HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) + QueryType type) : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + type{type}, query{cache.AllocateQuery(type)}, tick{cache.Scheduler().CurrentTick()} { const vk::Device* logical = &cache.Device().GetLogical(); cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { logical->ResetQueryPoolEXT(query.first, query.second, 1); @@ -110,11 +116,22 @@ void HostCounter::EndQuery() { } u64 HostCounter::BlockingQuery() const { - if (ticks >= cache.Scheduler().Ticks()) { + if (tick >= cache.Scheduler().CurrentTick()) { cache.Scheduler().Flush(); } - return cache.Device().GetLogical().GetQueryResult<u64>( - query.first, query.second, VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + u64 data; + const VkResult result = cache.Device().GetLogical().GetQueryResults( + query.first, query.second, 1, sizeof(data), &data, sizeof(data), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + switch (result) { + case VK_SUCCESS: + return data; + case VK_ERROR_DEVICE_LOST: + cache.Device().ReportLoss(); + [[fallthrough]]; + default: + throw vk::Exception(result); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index b63784f4b..2e57fb75d 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -5,14 +5,13 @@ #pragma once #include <cstddef> -#include <cstdint> #include <memory> #include <utility> #include <vector> #include "common/common_types.h" #include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace VideoCore { @@ -29,14 +28,12 @@ class VKScheduler; using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; -class QueryPool final : public VKFencedPool { +class QueryPool final : public ResourcePool { public: - explicit QueryPool(); + explicit QueryPool(const VKDevice& device, VKScheduler& scheduler, VideoCore::QueryType type); ~QueryPool() override; - void Initialize(const VKDevice& device, VideoCore::QueryType type); - - std::pair<VkQueryPool, u32> Commit(VKFence& fence); + std::pair<VkQueryPool, u32> Commit(); void Reserve(std::pair<VkQueryPool, u32> query); @@ -46,18 +43,18 @@ protected: private: static constexpr std::size_t GROW_STEP = 512; - const VKDevice* device = nullptr; - VideoCore::QueryType type = {}; + const VKDevice& device; + const 
VideoCore::QueryType type; std::vector<vk::QueryPool> pools; std::vector<bool> usage; }; class VKQueryCache final - : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool> { + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler); ~VKQueryCache(); @@ -76,6 +73,7 @@ public: private: const VKDevice& device; VKScheduler& scheduler; + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { @@ -92,7 +90,7 @@ private: VKQueryCache& cache; const VideoCore::QueryType type; const std::pair<VkQueryPool, u32> query; - const u64 ticks; + const u64 tick; }; class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 774ba1f26..e0fb8693f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -9,14 +9,14 @@ #include <vector> #include <boost/container/static_vector.hpp> -#include <boost/functional/hash.hpp> #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "core/core.h" -#include "core/memory.h" +#include "core/settings.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -31,7 +31,6 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -39,6 +38,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -64,20 +64,22 @@ VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::si const auto& src = regs.viewport_transform[index]; const float width = src.scale_x * 2.0f; const float height = src.scale_y * 2.0f; + const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; - VkViewport viewport; - viewport.x = src.translate_x - src.scale_x; - viewport.y = src.translate_y - src.scale_y; - viewport.width = width != 0.0f ? width : 1.0f; - viewport.height = height != 0.0f ? height : 1.0f; + VkViewport viewport{ + .x = src.translate_x - src.scale_x, + .y = src.translate_y - src.scale_y, + .width = width != 0.0f ? width : 1.0f, + .height = height != 0.0f ? height : 1.0f, + .minDepth = src.translate_z - src.scale_z * reduce_z, + .maxDepth = src.translate_z + src.scale_z, + }; - const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 
1.0f : 0.0f; - viewport.minDepth = src.translate_z - src.scale_z * reduce_z; - viewport.maxDepth = src.translate_z + src.scale_z; if (!device.IsExtDepthRangeUnrestrictedSupported()) { viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); } + return viewport; } @@ -99,7 +101,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { } std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; for (std::size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -118,14 +120,24 @@ template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); - if (entry.IsBindless()) { - const Tegra::Texture::TextureHandle tex_handle = - engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset()); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } + if (entry.is_bindless) { + const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); } const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); - const u32 offset = entry.GetOffset() + entry_offset; + const u32 offset = entry.offset + entry_offset; if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { return engine.GetStageTexture(stage_type, offset); } else { @@ -133,92 +145,144 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry } } +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) { + if (!is_clear) { + return true; + } + // First we have to make sure all clear masks are enabled. 
+ if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B || + !regs.clear_buffers.A) { + return true; + } + // If scissors are disabled, the whole screen is cleared + if (!regs.clear_flags.scissor) { + return false; + } + // Then we have to confirm scissor testing clears the whole image + const std::size_t index = regs.clear_buffers.RT; + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width || + scissor.max_y < regs.rt[index].height; +} + +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) { + // If we are not clearing, the contents have to be preserved + if (!is_clear) { + return true; + } + // For depth stencil clears we only have to confirm scissor test covers the whole image + if (!regs.clear_flags.scissor) { + return false; + } + // Make sure the clear cover the whole image + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width || + scissor.max_y < regs.zeta_height; +} + +template <std::size_t N> +std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) { + std::array<VkDeviceSize, N> expanded; + std::copy(strides.begin(), strides.end(), expanded.begin()); + return expanded; +} + } // Anonymous namespace class BufferBindings final { public: - void AddVertexBinding(const VkBuffer* buffer, VkDeviceSize offset) { - vertex.buffer_ptrs[vertex.num_buffers] = buffer; + void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { + vertex.buffers[vertex.num_buffers] = buffer; vertex.offsets[vertex.num_buffers] = offset; + vertex.sizes[vertex.num_buffers] = size; + vertex.strides[vertex.num_buffers] = static_cast<u16>(stride); ++vertex.num_buffers; } - void SetIndexBinding(const VkBuffer* buffer, VkDeviceSize offset, VkIndexType type) { + void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) { index.buffer = buffer; index.offset = offset; index.type = type; } - void Bind(VKScheduler& scheduler) const { + void Bind(const VKDevice& device, VKScheduler& scheduler) const { // Use this large switch case to avoid dispatching more memory in the record lambda than // what we need. It looks horrible, but it's the best we can do on standard C++. 
switch (vertex.num_buffers) { case 0: - return BindStatic<0>(scheduler); + return BindStatic<0>(device, scheduler); case 1: - return BindStatic<1>(scheduler); + return BindStatic<1>(device, scheduler); case 2: - return BindStatic<2>(scheduler); + return BindStatic<2>(device, scheduler); case 3: - return BindStatic<3>(scheduler); + return BindStatic<3>(device, scheduler); case 4: - return BindStatic<4>(scheduler); + return BindStatic<4>(device, scheduler); case 5: - return BindStatic<5>(scheduler); + return BindStatic<5>(device, scheduler); case 6: - return BindStatic<6>(scheduler); + return BindStatic<6>(device, scheduler); case 7: - return BindStatic<7>(scheduler); + return BindStatic<7>(device, scheduler); case 8: - return BindStatic<8>(scheduler); + return BindStatic<8>(device, scheduler); case 9: - return BindStatic<9>(scheduler); + return BindStatic<9>(device, scheduler); case 10: - return BindStatic<10>(scheduler); + return BindStatic<10>(device, scheduler); case 11: - return BindStatic<11>(scheduler); + return BindStatic<11>(device, scheduler); case 12: - return BindStatic<12>(scheduler); + return BindStatic<12>(device, scheduler); case 13: - return BindStatic<13>(scheduler); + return BindStatic<13>(device, scheduler); case 14: - return BindStatic<14>(scheduler); + return BindStatic<14>(device, scheduler); case 15: - return BindStatic<15>(scheduler); + return BindStatic<15>(device, scheduler); case 16: - return BindStatic<16>(scheduler); + return BindStatic<16>(device, scheduler); case 17: - return BindStatic<17>(scheduler); + return BindStatic<17>(device, scheduler); case 18: - return BindStatic<18>(scheduler); + return BindStatic<18>(device, scheduler); case 19: - return BindStatic<19>(scheduler); + return BindStatic<19>(device, scheduler); case 20: - return BindStatic<20>(scheduler); + return BindStatic<20>(device, scheduler); case 21: - return BindStatic<21>(scheduler); + return BindStatic<21>(device, scheduler); case 22: - return BindStatic<22>(scheduler); + return BindStatic<22>(device, scheduler); case 23: - return BindStatic<23>(scheduler); + return BindStatic<23>(device, scheduler); case 24: - return BindStatic<24>(scheduler); + return BindStatic<24>(device, scheduler); case 25: - return BindStatic<25>(scheduler); + return BindStatic<25>(device, scheduler); case 26: - return BindStatic<26>(scheduler); + return BindStatic<26>(device, scheduler); case 27: - return BindStatic<27>(scheduler); + return BindStatic<27>(device, scheduler); case 28: - return BindStatic<28>(scheduler); + return BindStatic<28>(device, scheduler); case 29: - return BindStatic<29>(scheduler); + return BindStatic<29>(device, scheduler); case 30: - return BindStatic<30>(scheduler); + return BindStatic<30>(device, scheduler); case 31: - return BindStatic<31>(scheduler); + return BindStatic<31>(device, scheduler); case 32: - return BindStatic<32>(scheduler); + return BindStatic<32>(device, scheduler); } UNREACHABLE(); } @@ -227,26 +291,36 @@ private: // Some of these fields are intentionally left uninitialized to avoid initializing them twice. 
struct { std::size_t num_buffers = 0; - std::array<const VkBuffer*, Maxwell::NumVertexArrays> buffer_ptrs; + std::array<VkBuffer, Maxwell::NumVertexArrays> buffers; std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets; + std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes; + std::array<u16, Maxwell::NumVertexArrays> strides; } vertex; struct { - const VkBuffer* buffer = nullptr; + VkBuffer buffer = nullptr; VkDeviceSize offset; VkIndexType type; } index; template <std::size_t N> - void BindStatic(VKScheduler& scheduler) const { - if (index.buffer != nullptr) { - BindStatic<N, true>(scheduler); + void BindStatic(const VKDevice& device, VKScheduler& scheduler) const { + if (device.IsExtExtendedDynamicStateSupported()) { + if (index.buffer) { + BindStatic<N, true, true>(scheduler); + } else { + BindStatic<N, false, true>(scheduler); + } } else { - BindStatic<N, false>(scheduler); + if (index.buffer) { + BindStatic<N, true, false>(scheduler); + } else { + BindStatic<N, false, false>(scheduler); + } } } - template <std::size_t N, bool is_indexed> + template <std::size_t N, bool is_indexed, bool has_extended_dynamic_state> void BindStatic(VKScheduler& scheduler) const { static_assert(N <= Maxwell::NumVertexArrays); if constexpr (N == 0) { @@ -254,18 +328,39 @@ private: } std::array<VkBuffer, N> buffers; - std::transform(vertex.buffer_ptrs.begin(), vertex.buffer_ptrs.begin() + N, buffers.begin(), - [](const auto ptr) { return *ptr; }); - std::array<VkDeviceSize, N> offsets; + std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin()); std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin()); + if constexpr (has_extended_dynamic_state) { + // With extended dynamic states we can specify the length and stride of a vertex buffer + std::array<VkDeviceSize, N> sizes; + std::array<u16, N> strides; + std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin()); + std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin()); + + if constexpr (is_indexed) { + scheduler.Record( + [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); + cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), + offsets.data(), sizes.data(), + ExpandStrides(strides).data()); + }); + } else { + scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) { + cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), + offsets.data(), sizes.data(), + ExpandStrides(strides).data()); + }); + } + return; + } + if constexpr (is_indexed) { // Indexed draw - scheduler.Record([buffers, offsets, index_buffer = *index.buffer, - index_offset = index.offset, - index_type = index.type](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index_buffer, index_offset, index_type); + scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data()); }); } else { @@ -285,25 +380,32 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { } } -RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, - VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, StateTracker& state_tracker, - VKScheduler& scheduler) - : RasterizerAccelerated{system.Memory()}, 
system{system}, render_window{renderer}, - screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, - staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), renderpass_cache(device), +RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Tegra::MemoryManager& gpu_memory_, + Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info_, + const VKDevice& device_, VKMemoryManager& memory_manager_, + StateTracker& state_tracker_, VKScheduler& scheduler_) + : RasterizerAccelerated(cpu_memory), gpu(gpu_), gpu_memory(gpu_memory_), + maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), screen_info(screen_info_), + device(device_), memory_manager(memory_manager_), state_tracker(state_tracker_), + scheduler(scheduler_), staging_pool(device, memory_manager, scheduler), + descriptor_pool(device, scheduler_), update_descriptor_queue(device, scheduler), + renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, - staging_pool), - pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, - renderpass_cache), - buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device), query_cache(system, *this, device, scheduler) { + texture_cache(*this, maxwell3d, gpu_memory, device, memory_manager, scheduler, staging_pool), + pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, + descriptor_pool, update_descriptor_queue, renderpass_cache), + buffer_cache(*this, gpu_memory, cpu_memory, device, memory_manager, scheduler, staging_pool), + sampler_cache(device), query_cache(*this, maxwell3d, gpu_memory, device, scheduler), + fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, device, + scheduler), + wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window) { scheduler.SetQueryCache(query_cache); + if (device.UseAsynchronousShaders()) { + async_shaders.AllocateWorkers(); + } } RasterizerVulkan::~RasterizerVulkan() = default; @@ -311,12 +413,13 @@ RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); + SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); query_cache.UpdateCounters(); - const auto& gpu = system.GPU().Maxwell3D(); - GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; + GraphicsPipelineCacheKey key; + key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); @@ -334,31 +437,32 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { buffer_cache.Unmap(); - const auto texceptions = UpdateAttachments(); + const Texceptions texceptions = UpdateAttachments(false); SetupImageTransitions(texceptions, color_attachments, zeta_attachment); key.renderpass_params = GetRenderPassParams(texceptions); + key.padding = 0; + + auto* pipeline = pipeline_cache.GetGraphicsPipeline(key, async_shaders); + if (pipeline == nullptr || 
pipeline->GetHandle() == VK_NULL_HANDLE) { + // Async graphics pipeline was not ready. + return; + } - auto& pipeline = pipeline_cache.GetGraphicsPipeline(key); - scheduler.BindGraphicsPipeline(pipeline.GetHandle()); + scheduler.BindGraphicsPipeline(pipeline->GetHandle()); - const auto renderpass = pipeline.GetRenderPass(); + const auto renderpass = pipeline->GetRenderPass(); const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); scheduler.RequestRenderpass(renderpass, framebuffer, render_area); UpdateDynamicStates(); - buffer_bindings.Bind(scheduler); - - if (device.IsNvDeviceDiagnosticCheckpoints()) { - scheduler.Record( - [&pipeline](vk::CommandBuffer cmdbuf) { cmdbuf.SetCheckpointNV(&pipeline); }); - } + buffer_bindings.Bind(device, scheduler); BeginTransformFeedback(); - const auto pipeline_layout = pipeline.GetLayout(); - const auto descriptor_set = pipeline.CommitDescriptorSet(); + const auto pipeline_layout = pipeline->GetLayout(); + const auto descriptor_set = pipeline->CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { if (descriptor_set) { cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, @@ -373,8 +477,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - const auto& gpu = system.GPU().Maxwell3D(); - if (!system.GPU().Maxwell3D().ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } @@ -383,7 +486,7 @@ void RasterizerVulkan::Clear() { query_cache.UpdateCounters(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; const bool use_depth = regs.clear_buffers.Z; @@ -392,7 +495,7 @@ void RasterizerVulkan::Clear() { return; } - [[maybe_unused]] const auto texceptions = UpdateAttachments(); + [[maybe_unused]] const auto texceptions = UpdateAttachments(true); DEBUG_ASSERT(texceptions.none()); SetupImageTransitions(0, color_attachments, zeta_attachment); @@ -413,10 +516,11 @@ void RasterizerVulkan::Clear() { const u32 color_attachment = regs.clear_buffers.RT; scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) { - VkClearAttachment attachment; - attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - attachment.colorAttachment = color_attachment; - attachment.clearValue = clear_value; + const VkClearAttachment attachment{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = color_attachment, + .clearValue = clear_value, + }; cmdbuf.ClearAttachments(attachment, clear_rect); }); } @@ -434,10 +538,6 @@ void RasterizerVulkan::Clear() { scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { - VkClearValue clear_value; - clear_value.depthStencil.depth = clear_depth; - clear_value.depthStencil.stencil = clear_stencil; - VkClearAttachment attachment; attachment.aspectMask = aspect_flags; attachment.colorAttachment = 0; @@ -455,12 +555,17 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { query_cache.UpdateCounters(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - const ComputePipelineCacheKey key{ - code_addr, - launch_desc.shared_alloc, - {launch_desc.block_dim_x, launch_desc.block_dim_y, launch_desc.block_dim_z}}; - auto& pipeline = pipeline_cache.GetComputePipeline(key); + const auto& 
launch_desc = kepler_compute.launch_description; + auto& pipeline = pipeline_cache.GetComputePipeline({ + .shader = code_addr, + .shared_memory_size = launch_desc.shared_alloc, + .workgroup_size = + { + launch_desc.block_dim_x, + launch_desc.block_dim_y, + launch_desc.block_dim_z, + }, + }); // Compute dispatches can't be executed inside a renderpass scheduler.RequestOutsideRenderPassOperationContext(); @@ -470,8 +575,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { const auto& entries = pipeline.GetEntries(); SetupComputeConstBuffers(entries); SetupComputeGlobalBuffers(entries); - SetupComputeTexelBuffers(entries); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); + SetupComputeStorageTexels(entries); SetupComputeImages(entries); buffer_cache.Unmap(); @@ -481,11 +587,6 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { TransitionImages(image_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); - if (device.IsNvDeviceDiagnosticCheckpoints()) { - scheduler.Record( - [&pipeline](vk::CommandBuffer cmdbuf) { cmdbuf.SetCheckpointNV(nullptr); }); - } - scheduler.Record([grid_x = launch_desc.grid_dim_x, grid_y = launch_desc.grid_dim_y, grid_z = launch_desc.grid_dim_z, pipeline_handle = pipeline.GetHandle(), layout = pipeline.GetLayout(), @@ -517,6 +618,13 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { query_cache.FlushRegion(addr, size); } +bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { + if (!Settings::IsGPULevelHigh()) { + return buffer_cache.MustFlushRegion(addr, size); + } + return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); +} + void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; @@ -527,11 +635,71 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { query_cache.InvalidateRegion(addr, size); } +void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + texture_cache.OnCPUWrite(addr, size); + pipeline_cache.OnCPUWrite(addr, size); + buffer_cache.OnCPUWrite(addr, size); +} + +void RasterizerVulkan::SyncGuestHost() { + texture_cache.SyncGuestHost(); + buffer_cache.SyncGuestHost(); + pipeline_cache.SyncGuestHost(); +} + +void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { + if (!gpu.IsAsync()) { + gpu_memory.Write<u32>(addr, value); + return; + } + fence_manager.SignalSemaphore(addr, value); +} + +void RasterizerVulkan::SignalSyncPoint(u32 value) { + if (!gpu.IsAsync()) { + gpu.IncrementSyncPoint(value); + return; + } + fence_manager.SignalSyncPoint(value); +} + +void RasterizerVulkan::ReleaseFences() { + if (!gpu.IsAsync()) { + return; + } + fence_manager.WaitPendingFences(); +} + void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) { - FlushRegion(addr, size); + if (Settings::IsGPULevelExtreme()) { + FlushRegion(addr, size); + } InvalidateRegion(addr, size); } +void RasterizerVulkan::WaitForIdle() { + // Everything but wait pixel operations. This intentionally includes FRAGMENT_SHADER_BIT because + // fragment shaders can still write storage buffers. 
+ VkPipelineStageFlags flags = + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT; + if (device.IsExtTransformFeedbackSupported()) { + flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; + } + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { + cmdbuf.SetEvent(event, flags); + cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); + }); +} + void RasterizerVulkan::FlushCommands() { if (draw_counter > 0) { draw_counter = 0; @@ -576,10 +744,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerVulkan::SetupDirtyFlags() { - state_tracker.Initialize(); -} - void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -601,9 +765,11 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { +RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& dirty = system.GPU().Maxwell3D().dirty.flags; + + const auto& regs = maxwell3d.regs; + auto& dirty = maxwell3d.dirty.flags; const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; dirty[VideoCommon::Dirty::RenderTargets] = false; @@ -612,7 +778,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { Texceptions texceptions; for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { if (update_rendertargets) { - color_attachments[rt] = texture_cache.GetColorBufferSurface(rt); + const bool preserve_contents = HasToPreserveColorContents(is_clear, regs); + color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents); } if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { texceptions[rt] = true; @@ -620,7 +787,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { } if (update_rendertargets) { - zeta_attachment = texture_cache.GetDepthBufferSurface(); + const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs); + zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents); } if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { texceptions[ZETA_TEXCEPTION_INDEX] = true; @@ -645,21 +813,28 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( VkRenderPass renderpass) { - FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(), - std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()}; + FramebufferCacheKey key{ + .renderpass = renderpass, + .width = std::numeric_limits<u32>::max(), + .height = std::numeric_limits<u32>::max(), + .layers = std::numeric_limits<u32>::max(), + .views = {}, + }; - const auto try_push = [&](const View& view) { + const auto try_push = [&key](const View& view) { if (!view) { return false; } - key.views.push_back(view->GetHandle()); + key.views.push_back(view->GetAttachment()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); key.layers 
= std::min(key.layers, view->GetNumLayers()); return true; }; - for (std::size_t index = 0; index < std::size(color_attachments); ++index) { + const auto& regs = maxwell3d.regs; + const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); + for (std::size_t index = 0; index < num_attachments; ++index) { if (try_push(color_attachments[index])) { texture_cache.MarkColorBufferInUse(index); } @@ -671,17 +846,17 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); auto& framebuffer = fbentry->second; if (is_cache_miss) { - VkFramebufferCreateInfo framebuffer_ci; - framebuffer_ci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebuffer_ci.pNext = nullptr; - framebuffer_ci.flags = 0; - framebuffer_ci.renderPass = key.renderpass; - framebuffer_ci.attachmentCount = static_cast<u32>(key.views.size()); - framebuffer_ci.pAttachments = key.views.data(); - framebuffer_ci.width = key.width; - framebuffer_ci.height = key.height; - framebuffer_ci.layers = key.layers; - framebuffer = device.GetLogical().CreateFramebuffer(framebuffer_ci); + framebuffer = device.GetLogical().CreateFramebuffer({ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = key.renderpass, + .attachmentCount = static_cast<u32>(key.views.size()), + .pAttachments = key.views.data(), + .width = key.width, + .height = key.height, + .layers = key.layers, + }); } return {*framebuffer, VkExtent2D{key.width, key.height}}; @@ -693,13 +868,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Geometry); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; - SetupVertexArrays(fixed_state.vertex_input, buffer_bindings); + SetupVertexArrays(buffer_bindings); const u32 base_instance = regs.vb_base_instance; - const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1; + const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1; const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; const u32 num_vertices = is_indexed ? 
regs.index_array.count : regs.vertex_buffer.count; @@ -710,20 +884,21 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt } void RasterizerVulkan::SetupShaderDescriptors( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { texture_cache.GuardSamplers(true); for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { // Skip VertexA stage - const auto& shader = shaders[stage + 1]; + Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } const auto& entries = shader->GetEntries(); SetupGraphicsConstBuffers(entries, stage); SetupGraphicsGlobalBuffers(entries, stage); - SetupGraphicsTexelBuffers(entries, stage); + SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); + SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); } texture_cache.GuardSamplers(false); @@ -759,20 +934,34 @@ void RasterizerVulkan::SetupImageTransitions( } void RasterizerVulkan::UpdateDynamicStates() { - auto& regs = system.GPU().Maxwell3D().regs; + auto& regs = maxwell3d.regs; UpdateViewportsState(regs); UpdateScissorsState(regs); UpdateDepthBias(regs); UpdateBlendConstants(regs); UpdateDepthBounds(regs); UpdateStencilFaces(regs); + if (device.IsExtExtendedDynamicStateSupported()) { + UpdateCullMode(regs); + UpdateDepthBoundsTestEnable(regs); + UpdateDepthTestEnable(regs); + UpdateDepthWriteEnable(regs); + UpdateDepthCompareOp(regs); + UpdateFrontFace(regs); + UpdateStencilOp(regs); + UpdateStencilTestEnable(regs); + } } void RasterizerVulkan::BeginTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + return; + } UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || @@ -787,90 +976,92 @@ void RasterizerVulkan::BeginTransformFeedback() { UNIMPLEMENTED_IF(binding.buffer_offset != 0); const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - scheduler.Record([buffer = *buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { + scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + return; + } scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } -void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, - BufferBindings& buffer_bindings) { - const auto& regs = system.GPU().Maxwell3D().regs; - - for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexAttributes); ++index) { - const auto& attrib = regs.vertex_attrib_format[index]; - if 
(!attrib.IsValid()) { - continue; - } +void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) { + const auto& regs = maxwell3d.regs; - const auto& buffer = regs.vertex_array[attrib.buffer]; - ASSERT(buffer.IsEnabled()); - - vertex_input.attributes[vertex_input.num_attributes++] = - FixedPipelineState::VertexAttribute(index, attrib.buffer, attrib.type, attrib.size, - attrib.offset); - } - - for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexArrays); ++index) { + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) { continue; } - const GPUVAddr start{vertex_array.StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - ASSERT(end > start); - const std::size_t size{end - start + 1}; - const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); - - vertex_input.bindings[vertex_input.num_bindings++] = FixedPipelineState::VertexBinding( - index, vertex_array.stride, - regs.instanced_arrays.IsInstancingEnabled(index) ? vertex_array.divisor : 0); - buffer_bindings.AddVertexBinding(buffer, offset); + ASSERT(end >= start); + const std::size_t size = end - start; + if (size == 0) { + buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0); + continue; + } + const auto info = buffer_cache.UploadMemory(start, size); + buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride); } } void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed) { - const auto& regs = system.GPU().Maxwell3D().regs; + if (params.num_vertices == 0) { + return; + } + const auto& regs = maxwell3d.regs; switch (regs.draw.topology) { - case Maxwell::PrimitiveTopology::Quads: - if (params.is_indexed) { - UNIMPLEMENTED(); - } else { + case Maxwell::PrimitiveTopology::Quads: { + if (!params.is_indexed) { const auto [buffer, offset] = quad_array_pass.Assemble(params.num_vertices, params.base_vertex); buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); params.base_vertex = 0; params.num_vertices = params.num_vertices * 6 / 4; params.is_indexed = true; + break; } + const GPUVAddr gpu_addr = regs.index_array.IndexStart(); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; + std::tie(buffer, offset) = quad_indexed_pass.Assemble( + regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); + + buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); + params.num_vertices = (params.num_vertices / 4) * 6; + params.base_vertex = 0; break; + } default: { if (!is_indexed) { break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; auto format = regs.index_array.format; const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) { - std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, *buffer, offset); + std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset); format = Maxwell::IndexFormat::UnsignedShort; } @@ -882,8 +1073,7 @@ void 
RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& shader_stage = gpu.state.shader_stages[stage]; + const auto& shader_stage = maxwell3d.state.shader_stages[stage]; for (const auto& entry : entries.const_buffers) { SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); } @@ -891,8 +1081,7 @@ void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, s void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - auto& gpu{system.GPU()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage]}; + const auto& cbufs{maxwell3d.state.shader_stages[stage]}; for (const auto& entry : entries.global_buffers) { const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); @@ -900,38 +1089,43 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, } } -void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); - for (const auto& entry : entries.texel_buffers) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; - SetupTexelBuffer(image, entry); + for (const auto& entry : entries.uniform_texels) { + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; + SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(gpu, entry, stage, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, stage, i); SetupTexture(texture, entry); } } } +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, stage).tic; + const auto tic = GetTextureInfo(maxwell3d, entry, stage).tic; SetupImage(tic, entry); } } void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); @@ -945,38 +1139,43 @@ void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { 
MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto cbufs{system.GPU().KeplerCompute().launch_description.const_buffer_config}; + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; for (const auto& entry : entries.global_buffers) { const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; SetupGlobalBuffer(entry, addr); } } -void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { +void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); - for (const auto& entry : entries.texel_buffers) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupTexelBuffer(image, entry); + for (const auto& entry : entries.uniform_texels) { + const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; + SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex, i); SetupTexture(texture, entry); } } } +void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; SetupImage(tic, entry); } } @@ -985,8 +1184,7 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) { if (!buffer.enabled) { // Set values to zero to unbind buffers - update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE); return; } @@ -995,33 +1193,33 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); ASSERT(size <= MaxConstbufferSize); - const auto [buffer_handle, offset] = + const auto info = buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - - update_descriptor_queue.AddBuffer(buffer_handle, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto actual_addr = memory_manager.Read<u64>(address); - const auto size = memory_manager.Read<u32>(address + 8); + const u64 actual_addr = gpu_memory.Read<u64>(address); + const u32 size = gpu_memory.Read<u32>(address + 8); if (size == 0) { - // Sometimes global memory pointers don't have a proper size. 
Upload a dummy entry because - // Vulkan doesn't like empty buffers. - constexpr std::size_t dummy_size = 4; - const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); + // Sometimes global memory pointers don't have a proper size. Upload a dummy entry + // because Vulkan doesn't like empty buffers. + // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the + // default buffer. + static constexpr std::size_t dummy_size = 4; + const auto info = buffer_cache.GetEmptyBuffer(dummy_size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); return; } - const auto [buffer, offset] = buffer_cache.UploadMemory( + const auto info = buffer_cache.UploadMemory( actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(buffer, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } -void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, - const TexelBufferEntry& entry) { +void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, + const UniformTexelEntry& entry) { const auto view = texture_cache.GetTextureSurface(tic, entry); ASSERT(view->IsBufferView()); @@ -1033,29 +1231,38 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu auto view = texture_cache.GetTextureSurface(texture.tic, entry); ASSERT(!view->IsBufferView()); - const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); + const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; sampled_views.push_back(ImageView{std::move(view), image_layout}); } +void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, + const StorageTexelEntry& entry) { + const auto view = texture_cache.GetImageSurface(tic, entry); + ASSERT(view->IsBufferView()); + + update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} + void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { auto view = texture_cache.GetImageSurface(tic, entry); - if (entry.IsWritten()) { + if (entry.is_written) { view->MarkAsModified(texture_cache.Tick()); } UNIMPLEMENTED_IF(tic.IsBuffer()); - const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + const VkImageView image_view = + view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_GENERAL; image_views.push_back(ImageView{std::move(view), image_layout}); } @@ -1150,6 +1357,107 @@ void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) } } +void RasterizerVulkan::UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchCullMode()) { + return; + } + scheduler.Record( + [enabled = 
regs.cull_test_enabled, cull_face = regs.cull_face](vk::CommandBuffer cmdbuf) { + cmdbuf.SetCullModeEXT(enabled ? MaxwellToVK::CullFace(cull_face) : VK_CULL_MODE_NONE); + }); +} + +void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthBoundsTestEnable()) { + return; + } + scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthBoundsTestEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthTestEnable()) { + return; + } + scheduler.Record([enable = regs.depth_test_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthTestEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthWriteEnable()) { + return; + } + scheduler.Record([enable = regs.depth_write_enabled](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthWriteEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthCompareOp()) { + return; + } + scheduler.Record([func = regs.depth_test_func](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthCompareOpEXT(MaxwellToVK::ComparisonOp(func)); + }); +} + +void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchFrontFace()) { + return; + } + + VkFrontFace front_face = MaxwellToVK::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0) { + front_face = front_face == VK_FRONT_FACE_CLOCKWISE ? VK_FRONT_FACE_COUNTER_CLOCKWISE + : VK_FRONT_FACE_CLOCKWISE; + } + scheduler.Record( + [front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); }); +} + +void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilOp()) { + return; + } + const Maxwell::StencilOp fail = regs.stencil_front_op_fail; + const Maxwell::StencilOp zfail = regs.stencil_front_op_zfail; + const Maxwell::StencilOp zpass = regs.stencil_front_op_zpass; + const Maxwell::ComparisonOp compare = regs.stencil_front_func_func; + if (regs.stencil_two_side_enable) { + scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail), + MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail), + MaxwellToVK::ComparisonOp(compare)); + }); + } else { + const Maxwell::StencilOp back_fail = regs.stencil_back_op_fail; + const Maxwell::StencilOp back_zfail = regs.stencil_back_op_zfail; + const Maxwell::StencilOp back_zpass = regs.stencil_back_op_zpass; + const Maxwell::ComparisonOp back_compare = regs.stencil_back_func_func; + scheduler.Record([fail, zfail, zpass, compare, back_fail, back_zfail, back_zpass, + back_compare](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_BIT, MaxwellToVK::StencilOp(fail), + MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail), + MaxwellToVK::ComparisonOp(compare)); + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_BACK_BIT, MaxwellToVK::StencilOp(back_fail), + MaxwellToVK::StencilOp(back_zpass), + MaxwellToVK::StencilOp(back_zfail), + MaxwellToVK::ComparisonOp(back_compare)); + }); + } +} + +void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilTestEnable()) { + return; + } + scheduler.Record([enable = 
regs.stencil_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilTestEnableEXT(enable); + }); +} + std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const { std::size_t size = CalculateVertexArraysSize(); if (is_indexed) { @@ -1165,7 +1473,7 @@ std::size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const { } std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -1174,15 +1482,14 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; DEBUG_ASSERT(end >= start); - size += (end - start + 1) * regs.vertex_array[index].enable; + size += (end - start) * regs.vertex_array[index].enable; } return size; } std::size_t RasterizerVulkan::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } std::size_t RasterizerVulkan::CalculateConstBufferSize( @@ -1197,28 +1504,54 @@ std::size_t RasterizerVulkan::CalculateConstBufferSize( } RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const { - using namespace VideoCore::Surface; + const auto& regs = maxwell3d.regs; + const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); - const auto& regs = system.GPU().Maxwell3D().regs; - RenderPassParams renderpass_params; + RenderPassParams params; + params.color_formats = {}; + std::size_t color_texceptions = 0; - for (std::size_t rt = 0; rt < static_cast<std::size_t>(regs.rt_control.count); ++rt) { + std::size_t index = 0; + for (std::size_t rt = 0; rt < num_attachments; ++rt) { const auto& rendertarget = regs.rt[rt]; if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) { continue; } - renderpass_params.color_attachments.push_back(RenderPassParams::ColorAttachment{ - static_cast<u32>(rt), PixelFormatFromRenderTargetFormat(rendertarget.format), - texceptions[rt]}); + params.color_formats[index] = static_cast<u8>(rendertarget.format); + color_texceptions |= (texceptions[rt] ? 1ULL : 0ULL) << index; + ++index; } + params.num_color_attachments = static_cast<u8>(index); + params.texceptions = static_cast<u8>(color_texceptions); - renderpass_params.has_zeta = regs.zeta_enable; - if (renderpass_params.has_zeta) { - renderpass_params.zeta_pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - renderpass_params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; - } + params.zeta_format = regs.zeta_enable ? 
static_cast<u8>(regs.zeta.format) : 0; + params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; + return params; +} + +VkBuffer RasterizerVulkan::DefaultBuffer() { + if (default_buffer) { + return *default_buffer; + } + + default_buffer = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = DEFAULT_BUFFER_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + default_buffer_commit = memory_manager.Commit(default_buffer, false); - return renderpass_params; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0); + }); + return *default_buffer; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 46037860a..237e51fa4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -14,24 +14,24 @@ #include <boost/functional/hash.hpp> #include "common/common_types.h" -#include "video_core/memory_manager.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader/async_shaders.h" namespace Core { class System; @@ -105,10 +105,11 @@ struct ImageView { class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - StateTracker& state_tracker, VKScheduler& scheduler); + VKMemoryManager& memory_manager, StateTracker& state_tracker, + VKScheduler& scheduler); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -118,8 +119,15 @@ public: void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; + void OnCPUWrite(VAddr addr, u64 size) override; + void SyncGuestHost() 
override; + void SignalSemaphore(GPUVAddr addr, u32 value) override; + void SignalSyncPoint(u32 value) override; + void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -127,7 +135,14 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void SetupDirtyFlags() override; + + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } /// Maximum supported size that a constbuffer can have in bytes. static constexpr std::size_t MaxConstbufferSize = 0x10000; @@ -148,10 +163,14 @@ private: using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>; static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; + static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); void FlushWork(); - Texceptions UpdateAttachments(); + /// @brief Updates the currently bound attachments + /// @param is_clear True when the framebuffer is updated as a clear + /// @return Bitfield of attachments being used as sampled textures + Texceptions UpdateAttachments(bool is_clear); std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass); @@ -160,7 +179,7 @@ private: bool is_indexed, bool is_instanced); /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders); + void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); void SetupImageTransitions(Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, @@ -174,8 +193,7 @@ private: bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); - void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, - BufferBindings& buffer_bindings); + void SetupVertexArrays(BufferBindings& buffer_bindings); void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed); @@ -185,12 +203,15 @@ private: /// Setup global buffers in the graphics pipeline. void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup texel buffers in the graphics pipeline. - void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage); + /// Setup uniform texels in the graphics pipeline. + void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); /// Setup textures in the graphics pipeline. void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage); + /// Setup storage texels in the graphics pipeline. + void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage); + /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); @@ -201,11 +222,14 @@ private: void SetupComputeGlobalBuffers(const ShaderEntries& entries); /// Setup texel buffers in the compute pipeline. - void SetupComputeTexelBuffers(const ShaderEntries& entries); + void SetupComputeUniformTexels(const ShaderEntries& entries); /// Setup textures in the compute pipeline. void SetupComputeTextures(const ShaderEntries& entries); + /// Setup storage texels in the compute pipeline. 
+ void SetupComputeStorageTexels(const ShaderEntries& entries); + /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); @@ -214,10 +238,12 @@ private: void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); + void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); + void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); + void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -227,6 +253,15 @@ private: void UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; std::size_t CalculateComputeStreamBufferSize() const; @@ -240,11 +275,15 @@ private: RenderPassParams GetRenderPassParams(Texceptions texceptions) const; - Core::System& system; - Core::Frontend::EmuWindow& render_window; + VkBuffer DefaultBuffer(); + + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + VKScreenInfo& screen_info; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; StateTracker& state_tracker; VKScheduler& scheduler; @@ -254,6 +293,7 @@ private: VKUpdateDescriptorQueue update_descriptor_queue; VKRenderPassCache renderpass_cache; QuadArrayPass quad_array_pass; + QuadIndexedPass quad_indexed_pass; Uint8Pass uint8_pass; VKTextureCache texture_cache; @@ -261,6 +301,12 @@ private: VKBufferCache buffer_cache; VKSamplerCache sampler_cache; VKQueryCache query_cache; + VKFenceManager fence_manager; + + vk::Buffer default_buffer; + VKMemoryCommit default_buffer_commit; + vk::Event wfi_event; + VideoCommon::Shader::AsyncShaders async_shaders; std::array<View, Maxwell::NumRenderTargets> color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp index 4e5286a69..80284cf92 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp @@ -2,9 +2,11 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <cstring> #include <memory> #include <vector> +#include "common/cityhash.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_device.h" @@ -13,6 +15,15 @@ namespace Vulkan { +std::size_t RenderPassParams::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); + return static_cast<std::size_t>(hash); +} + +bool RenderPassParams::operator==(const RenderPassParams& rhs) const noexcept { + return std::memcmp(&rhs, this, sizeof *this) == 0; +} + VKRenderPassCache::VKRenderPassCache(const VKDevice& device) : device{device} {} VKRenderPassCache::~VKRenderPassCache() = default; @@ -27,72 +38,86 @@ VkRenderPass VKRenderPassCache::GetRenderPass(const RenderPassParams& params) { } vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& params) const { + using namespace VideoCore::Surface; + const std::size_t num_attachments = static_cast<std::size_t>(params.num_color_attachments); + std::vector<VkAttachmentDescription> descriptors; + descriptors.reserve(num_attachments); + std::vector<VkAttachmentReference> color_references; + color_references.reserve(num_attachments); - for (std::size_t rt = 0; rt < params.color_attachments.size(); ++rt) { - const auto attachment = params.color_attachments[rt]; - const auto format = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, attachment.pixel_format); + for (std::size_t rt = 0; rt < num_attachments; ++rt) { + const auto guest_format = static_cast<Tegra::RenderTargetFormat>(params.color_formats[rt]); + const PixelFormat pixel_format = PixelFormatFromRenderTargetFormat(guest_format); + const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<u32>(attachment.pixel_format)); - - // TODO(Rodrigo): Add eMayAlias when it's needed. - const auto color_layout = attachment.is_texception - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - VkAttachmentDescription& descriptor = descriptors.emplace_back(); - descriptor.flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT; - descriptor.format = format.format; - descriptor.samples = VK_SAMPLE_COUNT_1_BIT; - descriptor.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - descriptor.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - descriptor.initialLayout = color_layout; - descriptor.finalLayout = color_layout; - - VkAttachmentReference& reference = color_references.emplace_back(); - reference.attachment = static_cast<u32>(rt); - reference.layout = color_layout; + static_cast<int>(pixel_format)); + + // TODO(Rodrigo): Add MAY_ALIAS_BIT when it's needed. + const VkImageLayout color_layout = ((params.texceptions >> rt) & 1) != 0 + ? 
VK_IMAGE_LAYOUT_GENERAL + : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + descriptors.push_back({ + .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, + .format = format.format, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, + .initialLayout = color_layout, + .finalLayout = color_layout, + }); + + color_references.push_back({ + .attachment = static_cast<u32>(rt), + .layout = color_layout, + }); } VkAttachmentReference zeta_attachment_ref; - if (params.has_zeta) { - const auto format = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.zeta_pixel_format); + const bool has_zeta = params.zeta_format != 0; + if (has_zeta) { + const auto guest_format = static_cast<Tegra::DepthFormat>(params.zeta_format); + const PixelFormat pixel_format = PixelFormatFromDepthFormat(guest_format); + const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<u32>(params.zeta_pixel_format)); - - const auto zeta_layout = params.zeta_texception - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - VkAttachmentDescription& descriptor = descriptors.emplace_back(); - descriptor.flags = 0; - descriptor.format = format.format; - descriptor.samples = VK_SAMPLE_COUNT_1_BIT; - descriptor.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.initialLayout = zeta_layout; - descriptor.finalLayout = zeta_layout; - - zeta_attachment_ref.attachment = static_cast<u32>(params.color_attachments.size()); - zeta_attachment_ref.layout = zeta_layout; + static_cast<int>(pixel_format)); + + const VkImageLayout zeta_layout = params.zeta_texception != 0 + ? VK_IMAGE_LAYOUT_GENERAL + : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + descriptors.push_back({ + .flags = 0, + .format = format.format, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = zeta_layout, + .finalLayout = zeta_layout, + }); + + zeta_attachment_ref = { + .attachment = static_cast<u32>(num_attachments), + .layout = zeta_layout, + }; } - VkSubpassDescription subpass_description; - subpass_description.flags = 0; - subpass_description.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass_description.inputAttachmentCount = 0; - subpass_description.pInputAttachments = nullptr; - subpass_description.colorAttachmentCount = static_cast<u32>(color_references.size()); - subpass_description.pColorAttachments = color_references.data(); - subpass_description.pResolveAttachments = nullptr; - subpass_description.pDepthStencilAttachment = params.has_zeta ? 
&zeta_attachment_ref : nullptr; - subpass_description.preserveAttachmentCount = 0; - subpass_description.pPreserveAttachments = nullptr; + const VkSubpassDescription subpass_description{ + .flags = 0, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = static_cast<u32>(color_references.size()), + .pColorAttachments = color_references.data(), + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = has_zeta ? &zeta_attachment_ref : nullptr, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; VkAccessFlags access = 0; VkPipelineStageFlags stage = 0; @@ -101,32 +126,33 @@ vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& param stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; } - if (params.has_zeta) { + if (has_zeta) { access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; stage |= VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; } - VkSubpassDependency subpass_dependency; - subpass_dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - subpass_dependency.dstSubpass = 0; - subpass_dependency.srcStageMask = stage; - subpass_dependency.dstStageMask = stage; - subpass_dependency.srcAccessMask = 0; - subpass_dependency.dstAccessMask = access; - subpass_dependency.dependencyFlags = 0; - - VkRenderPassCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.attachmentCount = static_cast<u32>(descriptors.size()); - ci.pAttachments = descriptors.data(); - ci.subpassCount = 1; - ci.pSubpasses = &subpass_description; - ci.dependencyCount = 1; - ci.pDependencies = &subpass_dependency; - return device.GetLogical().CreateRenderPass(ci); + const VkSubpassDependency subpass_dependency{ + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = stage, + .dstStageMask = stage, + .srcAccessMask = 0, + .dstAccessMask = access, + .dependencyFlags = 0, + }; + + return device.GetLogical().CreateRenderPass({ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = static_cast<u32>(descriptors.size()), + .pAttachments = descriptors.data(), + .subpassCount = 1, + .pSubpasses = &subpass_description, + .dependencyCount = 1, + .pDependencies = &subpass_dependency, + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.h b/src/video_core/renderer_vulkan/vk_renderpass_cache.h index 921b6efb5..8b0fec720 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.h +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.h @@ -4,8 +4,7 @@ #pragma once -#include <memory> -#include <tuple> +#include <type_traits> #include <unordered_map> #include <boost/container/static_vector.hpp> @@ -19,51 +18,25 @@ namespace Vulkan { class VKDevice; -// TODO(Rodrigo): Optimize this structure for faster hashing - struct RenderPassParams { - struct ColorAttachment { - u32 index = 0; - VideoCore::Surface::PixelFormat pixel_format = VideoCore::Surface::PixelFormat::Invalid; - bool is_texception = false; - - std::size_t Hash() const noexcept { - return static_cast<std::size_t>(pixel_format) | - static_cast<std::size_t>(is_texception) << 6 | - static_cast<std::size_t>(index) << 7; - } - - bool operator==(const ColorAttachment& rhs) const noexcept { - return std::tie(index, pixel_format, is_texception) == - std::tie(rhs.index, rhs.pixel_format, rhs.is_texception); - } - }; - - 
boost::container::static_vector<ColorAttachment, - Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - color_attachments{}; - // TODO(Rodrigo): Unify has_zeta into zeta_pixel_format and zeta_component_type. - VideoCore::Surface::PixelFormat zeta_pixel_format = VideoCore::Surface::PixelFormat::Invalid; - bool has_zeta = false; - bool zeta_texception = false; - - std::size_t Hash() const noexcept { - std::size_t hash = 0; - for (const auto& rt : color_attachments) { - boost::hash_combine(hash, rt.Hash()); - } - boost::hash_combine(hash, zeta_pixel_format); - boost::hash_combine(hash, has_zeta); - boost::hash_combine(hash, zeta_texception); - return hash; - } + std::array<u8, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> color_formats; + u8 num_color_attachments; + u8 texceptions; + + u8 zeta_format; + u8 zeta_texception; + + std::size_t Hash() const noexcept; + + bool operator==(const RenderPassParams& rhs) const noexcept; - bool operator==(const RenderPassParams& rhs) const { - return std::tie(color_attachments, zeta_pixel_format, has_zeta, zeta_texception) == - std::tie(rhs.color_attachments, rhs.zeta_pixel_format, rhs.has_zeta, - rhs.zeta_texception); + bool operator!=(const RenderPassParams& rhs) const noexcept { + return !operator==(rhs); } }; +static_assert(std::has_unique_object_representations_v<RenderPassParams>); +static_assert(std::is_trivially_copyable_v<RenderPassParams>); +static_assert(std::is_trivially_constructible_v<RenderPassParams>); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp deleted file mode 100644 index dc06f545a..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include "common/assert.h" -#include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -namespace { - -// TODO(Rodrigo): Fine tune these numbers. -constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; -constexpr std::size_t FENCES_GROW_STEP = 0x40; - -VkFenceCreateInfo BuildFenceCreateInfo() { - VkFenceCreateInfo fence_ci; - fence_ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fence_ci.pNext = nullptr; - fence_ci.flags = 0; - return fence_ci; -} - -} // Anonymous namespace - -class CommandBufferPool final : public VKFencedPool { -public: - CommandBufferPool(const VKDevice& device) - : VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {} - - void Allocate(std::size_t begin, std::size_t end) override { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. 
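Note on the reworked RenderPassParams above: the per-field Hash()/operator== bodies are dropped and replaced by declarations plus static_asserts that the struct is trivially copyable and has unique object representations, i.e. no padding bytes. That guarantee is what allows the key to be hashed and compared as raw memory. The actual definitions live in vk_renderpass_cache.cpp and are not part of this hunk; the following is only a sketch of what a byte-wise implementation could look like under that assumption.

// Hypothetical byte-wise hash/equality for a padding-free, trivially copyable
// key such as RenderPassParams. Not the actual definitions from this change.
#include <cstddef>
#include <cstring>
#include <functional>
#include <string_view>
#include <type_traits>

template <typename T>
std::size_t HashBytes(const T& value) noexcept {
    static_assert(std::has_unique_object_representations_v<T>,
                  "padding bytes would make the hash unstable");
    const auto* const data = reinterpret_cast<const char*>(&value);
    // Reuse the standard string_view hasher over the object's bytes.
    return std::hash<std::string_view>{}(std::string_view(data, sizeof(T)));
}

template <typename T>
bool EqualBytes(const T& lhs, const T& rhs) noexcept {
    static_assert(std::is_trivially_copyable_v<T>);
    return std::memcmp(&lhs, &rhs, sizeof(T)) == 0;
}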
- VkCommandPoolCreateInfo command_pool_ci; - command_pool_ci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - command_pool_ci.pNext = nullptr; - command_pool_ci.flags = - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; - command_pool_ci.queueFamilyIndex = device.GetGraphicsFamily(); - - Pool& pool = pools.emplace_back(); - pool.handle = device.GetLogical().CreateCommandPool(command_pool_ci); - pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); - } - - VkCommandBuffer Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); - const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; - const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; - return pools[pool_index].cmdbufs[sub_index]; - } - -private: - struct Pool { - vk::CommandPool handle; - vk::CommandBuffers cmdbufs; - }; - - const VKDevice& device; - std::vector<Pool> pools; -}; - -VKResource::VKResource() = default; - -VKResource::~VKResource() = default; - -VKFence::VKFence(const VKDevice& device) - : device{device}, handle{device.GetLogical().CreateFence(BuildFenceCreateInfo())} {} - -VKFence::~VKFence() = default; - -void VKFence::Wait() { - switch (const VkResult result = handle.Wait()) { - case VK_SUCCESS: - return; - case VK_ERROR_DEVICE_LOST: - device.ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(result); - } -} - -void VKFence::Release() { - ASSERT(is_owned); - is_owned = false; -} - -void VKFence::Commit() { - is_owned = true; - is_used = true; -} - -bool VKFence::Tick(bool gpu_wait, bool owner_wait) { - if (!is_used) { - // If a fence is not used it's always free. - return true; - } - if (is_owned && !owner_wait) { - // The fence is still being owned (Release has not been called) and ownership wait has - // not been asked. - return false; - } - - if (gpu_wait) { - // Wait for the fence if it has been requested. - (void)handle.Wait(); - } else { - if (handle.GetStatus() != VK_SUCCESS) { - // Vulkan fence is not ready, not much it can do here - return false; - } - } - - // Broadcast resources their free state. - for (auto* resource : protected_resources) { - resource->OnFenceRemoval(this); - } - protected_resources.clear(); - - // Prepare fence for reusage. 
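For reference, the two checks Tick() performs through the wrapper correspond to plain Vulkan calls: a non-blocking status query and a blocking wait followed by a reset so the fence can be committed again. A minimal sketch in raw Vulkan terms (illustrative only; the deleted code goes through vk::Fence):

#include <cstdint>
#include <vulkan/vulkan.h>

// Non-blocking poll: VK_SUCCESS means signaled, VK_NOT_READY means still pending.
bool IsFenceSignaled(VkDevice device, VkFence fence) {
    return vkGetFenceStatus(device, fence) == VK_SUCCESS;
}

// Blocking wait with no timeout, then a reset so the fence can be reused,
// mirroring the handle.Wait()/handle.Reset() pair in Tick().
void WaitAndResetFence(VkDevice device, VkFence fence) {
    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
    vkResetFences(device, 1, &fence);
}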
- handle.Reset(); - is_used = false; - return true; -} - -void VKFence::Protect(VKResource* resource) { - protected_resources.push_back(resource); -} - -void VKFence::Unprotect(VKResource* resource) { - const auto it = std::find(protected_resources.begin(), protected_resources.end(), resource); - ASSERT(it != protected_resources.end()); - - resource->OnFenceRemoval(this); - protected_resources.erase(it); -} - -void VKFence::RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept { - std::replace(std::begin(protected_resources), std::end(protected_resources), old_resource, - new_resource); -} - -VKFenceWatch::VKFenceWatch() = default; - -VKFenceWatch::VKFenceWatch(VKFence& initial_fence) { - Watch(initial_fence); -} - -VKFenceWatch::VKFenceWatch(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } -} - -VKFenceWatch& VKFenceWatch::operator=(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } - return *this; -} - -VKFenceWatch::~VKFenceWatch() { - if (fence) { - fence->Unprotect(this); - } -} - -void VKFenceWatch::Wait() { - if (fence == nullptr) { - return; - } - fence->Wait(); - fence->Unprotect(this); -} - -void VKFenceWatch::Watch(VKFence& new_fence) { - Wait(); - fence = &new_fence; - fence->Protect(this); -} - -bool VKFenceWatch::TryWatch(VKFence& new_fence) { - if (fence) { - return false; - } - fence = &new_fence; - fence->Protect(this); - return true; -} - -void VKFenceWatch::OnFenceRemoval(VKFence* signaling_fence) { - ASSERT_MSG(signaling_fence == fence, "Removing the wrong fence"); - fence = nullptr; -} - -VKFencedPool::VKFencedPool(std::size_t grow_step) : grow_step{grow_step} {} - -VKFencedPool::~VKFencedPool() = default; - -std::size_t VKFencedPool::CommitResource(VKFence& fence) { - const auto Search = [&](std::size_t begin, std::size_t end) -> std::optional<std::size_t> { - for (std::size_t iterator = begin; iterator < end; ++iterator) { - if (watches[iterator]->TryWatch(fence)) { - // The resource is now being watched, a free resource was successfully found. - return iterator; - } - } - return {}; - }; - // Try to find a free resource from the hinted position to the end. - auto found = Search(free_iterator, watches.size()); - if (!found) { - // Search from beginning to the hinted position. - found = Search(0, free_iterator); - if (!found) { - // Both searches failed, the pool is full; handle it. - const std::size_t free_resource = ManageOverflow(); - - // Watch will wait for the resource to be free. - watches[free_resource]->Watch(fence); - found = free_resource; - } - } - // Free iterator is hinted to the resource after the one that's been commited. - free_iterator = (*found + 1) % watches.size(); - return *found; -} - -std::size_t VKFencedPool::ManageOverflow() { - const std::size_t old_capacity = watches.size(); - Grow(); - - // The last entry is guaranted to be free, since it's the first element of the freshly - // allocated resources. 
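CommitResource() above searches for a free watch starting at a remembered hint, wraps around to the beginning, and only grows the pool when both passes fail; the hint is then advanced past the committed slot. The same pattern reappears in the new ResourcePool further down. Stated generically (an illustrative helper, not code from this change):

#include <cstddef>
#include <optional>

// Scan [hint, size) and then [0, hint); return the first index the predicate
// accepts and advance the hint past it. std::nullopt means the pool is full
// and the caller has to grow it.
template <typename Pred>
std::optional<std::size_t> FindFreeSlot(std::size_t& hint, std::size_t size, Pred&& is_free) {
    for (std::size_t offset = 0; offset < size; ++offset) {
        const std::size_t index = (hint + offset) % size;
        if (is_free(index)) {
            hint = (index + 1) % size;
            return index;
        }
    }
    return std::nullopt;
}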
- return old_capacity; -} - -void VKFencedPool::Grow() { - const std::size_t old_capacity = watches.size(); - watches.resize(old_capacity + grow_step); - std::generate(watches.begin() + old_capacity, watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); - Allocate(old_capacity, old_capacity + grow_step); -} - -VKResourceManager::VKResourceManager(const VKDevice& device) : device{device} { - GrowFences(FENCES_GROW_STEP); - command_buffer_pool = std::make_unique<CommandBufferPool>(device); -} - -VKResourceManager::~VKResourceManager() = default; - -VKFence& VKResourceManager::CommitFence() { - const auto StepFences = [&](bool gpu_wait, bool owner_wait) -> VKFence* { - const auto Tick = [=](auto& fence) { return fence->Tick(gpu_wait, owner_wait); }; - const auto hinted = fences.begin() + fences_iterator; - - auto it = std::find_if(hinted, fences.end(), Tick); - if (it == fences.end()) { - it = std::find_if(fences.begin(), hinted, Tick); - if (it == hinted) { - return nullptr; - } - } - fences_iterator = std::distance(fences.begin(), it) + 1; - if (fences_iterator >= fences.size()) - fences_iterator = 0; - - auto& fence = *it; - fence->Commit(); - return fence.get(); - }; - - VKFence* found_fence = StepFences(false, false); - if (!found_fence) { - // Try again, this time waiting. - found_fence = StepFences(true, false); - - if (!found_fence) { - // Allocate new fences and try again. - LOG_INFO(Render_Vulkan, "Allocating new fences {} -> {}", fences.size(), - fences.size() + FENCES_GROW_STEP); - - GrowFences(FENCES_GROW_STEP); - found_fence = StepFences(true, false); - ASSERT(found_fence != nullptr); - } - } - return *found_fence; -} - -VkCommandBuffer VKResourceManager::CommitCommandBuffer(VKFence& fence) { - return command_buffer_pool->Commit(fence); -} - -void VKResourceManager::GrowFences(std::size_t new_fences_count) { - const std::size_t previous_size = fences.size(); - fences.resize(previous_size + new_fences_count); - - std::generate(fences.begin() + previous_size, fences.end(), - [this] { return std::make_unique<VKFence>(device); }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h deleted file mode 100644 index f683d2276..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.h +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <memory> -#include <vector> -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKFence; -class VKResourceManager; - -class CommandBufferPool; - -/// Interface for a Vulkan resource -class VKResource { -public: - explicit VKResource(); - virtual ~VKResource(); - - /** - * Signals the object that an owning fence has been signaled. - * @param signaling_fence Fence that signals its usage end. - */ - virtual void OnFenceRemoval(VKFence* signaling_fence) = 0; -}; - -/** - * Fences take ownership of objects, protecting them from GPU-side or driver-side concurrent access. - * They must be commited from the resource manager. Their usage flow is: commit the fence from the - * resource manager, protect resources with it and use them, send the fence to an execution queue - * and Wait for it if needed and then call Release. Used resources will automatically be signaled - * when they are free to be reused. 
- * @brief Protects resources for concurrent usage and signals its release. - */ -class VKFence { - friend class VKResourceManager; - -public: - explicit VKFence(const VKDevice& device); - ~VKFence(); - - /** - * Waits for the fence to be signaled. - * @warning You must have ownership of the fence and it has to be previously sent to a queue to - * call this function. - */ - void Wait(); - - /** - * Releases ownership of the fence. Pass after it has been sent to an execution queue. - * Unmanaged usage of the fence after the call will result in undefined behavior because it may - * be being used for something else. - */ - void Release(); - - /// Protects a resource with this fence. - void Protect(VKResource* resource); - - /// Removes protection for a resource. - void Unprotect(VKResource* resource); - - /// Redirects one protected resource to a new address. - void RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept; - - /// Retreives the fence. - operator VkFence() const { - return *handle; - } - -private: - /// Take ownership of the fence. - void Commit(); - - /** - * Updates the fence status. - * @warning Waiting for the owner might soft lock the execution. - * @param gpu_wait Wait for the fence to be signaled by the driver. - * @param owner_wait Wait for the owner to signal its freedom. - * @returns True if the fence is free. Waiting for gpu and owner will always return true. - */ - bool Tick(bool gpu_wait, bool owner_wait); - - const VKDevice& device; ///< Device handler - vk::Fence handle; ///< Vulkan fence - std::vector<VKResource*> protected_resources; ///< List of resources protected by this fence - bool is_owned = false; ///< The fence has been commited but not released yet. - bool is_used = false; ///< The fence has been commited but it has not been checked to be free. -}; - -/** - * A fence watch is used to keep track of the usage of a fence and protect a resource or set of - * resources without having to inherit VKResource from their handlers. - */ -class VKFenceWatch final : public VKResource { -public: - explicit VKFenceWatch(); - VKFenceWatch(VKFence& initial_fence); - VKFenceWatch(VKFenceWatch&&) noexcept; - VKFenceWatch(const VKFenceWatch&) = delete; - ~VKFenceWatch() override; - - VKFenceWatch& operator=(VKFenceWatch&&) noexcept; - - /// Waits for the fence to be released. - void Wait(); - - /** - * Waits for a previous fence and watches a new one. - * @param new_fence New fence to wait to. - */ - void Watch(VKFence& new_fence); - - /** - * Checks if it's currently being watched and starts watching it if it's available. - * @returns True if a watch has started, false if it's being watched. - */ - bool TryWatch(VKFence& new_fence); - - void OnFenceRemoval(VKFence* signaling_fence) override; - - /** - * Do not use it paired with Watch. Use TryWatch instead. - * Returns true when the watch is free. - */ - bool IsUsed() const { - return fence != nullptr; - } - -private: - VKFence* fence{}; ///< Fence watching this resource. nullptr when the watch is free. -}; - -/** - * Handles a pool of resources protected by fences. Manages resource overflow allocating more - * resources. - */ -class VKFencedPool { -public: - explicit VKFencedPool(std::size_t grow_step); - virtual ~VKFencedPool(); - -protected: - /** - * Commits a free resource and protects it with a fence. It may allocate new resources. - * @param fence Fence that protects the commited resource. - * @returns Index of the resource commited. 
- */ - std::size_t CommitResource(VKFence& fence); - - /// Called when a chunk of resources have to be allocated. - virtual void Allocate(std::size_t begin, std::size_t end) = 0; - -private: - /// Manages pool overflow allocating new resources. - std::size_t ManageOverflow(); - - /// Allocates a new page of resources. - void Grow(); - - std::size_t grow_step = 0; ///< Number of new resources created after an overflow - std::size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found - std::vector<std::unique_ptr<VKFenceWatch>> watches; ///< Set of watched resources -}; - -/** - * The resource manager handles all resources that can be protected with a fence avoiding - * driver-side or GPU-side concurrent usage. Usage is documented in VKFence. - */ -class VKResourceManager final { -public: - explicit VKResourceManager(const VKDevice& device); - ~VKResourceManager(); - - /// Commits a fence. It has to be sent to a queue and released. - VKFence& CommitFence(); - - /// Commits an unused command buffer and protects it with a fence. - VkCommandBuffer CommitCommandBuffer(VKFence& fence); - -private: - /// Allocates new fences. - void GrowFences(std::size_t new_fences_count); - - const VKDevice& device; ///< Device handler. - std::size_t fences_iterator = 0; ///< Index where a free fence is likely to be found. - std::vector<std::unique_ptr<VKFence>> fences; ///< Pool of fences. - std::unique_ptr<CommandBufferPool> command_buffer_pool; ///< Pool of command buffers. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp new file mode 100644 index 000000000..ee274ac59 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -0,0 +1,63 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <optional> + +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace Vulkan { + +ResourcePool::ResourcePool(MasterSemaphore& master_semaphore_, size_t grow_step_) + : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} + +ResourcePool::~ResourcePool() = default; + +size_t ResourcePool::CommitResource() { + // Refresh semaphore to query updated results + master_semaphore.Refresh(); + + const auto search = [this](size_t begin, size_t end) -> std::optional<size_t> { + for (size_t iterator = begin; iterator < end; ++iterator) { + if (master_semaphore.IsFree(ticks[iterator])) { + ticks[iterator] = master_semaphore.CurrentTick(); + return iterator; + } + } + return {}; + }; + // Try to find a free resource from the hinted position to the end. + auto found = search(free_iterator, ticks.size()); + if (!found) { + // Search from beginning to the hinted position. + found = search(0, free_iterator); + if (!found) { + // Both searches failed, the pool is full; handle it. + const size_t free_resource = ManageOverflow(); + + ticks[free_resource] = master_semaphore.CurrentTick(); + found = free_resource; + } + } + // Free iterator is hinted to the resource after the one that's been commited. + free_iterator = (*found + 1) % ticks.size(); + return *found; +} + +size_t ResourcePool::ManageOverflow() { + const size_t old_capacity = ticks.size(); + Grow(); + + // The last entry is guaranted to be free, since it's the first element of the freshly + // allocated resources. 
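The new ResourcePool above replaces fences and fence watches with a single u64 tick per slot, checked against the MasterSemaphore. A derived pool only has to implement Allocate() and call CommitResource() to obtain an index whose previous user the GPU has already finished with. A hypothetical subclass, purely to illustrate the intended usage (the concrete pools added by this change, such as the command buffer pool, differ in detail):

// Illustrative ResourcePool subclass: the pool owns the payload objects and
// CommitResource() hands back a slot that is safe to reuse. Names are made up.
#include <cstddef>
#include <memory>
#include <vector>

class StagingPool final : public ResourcePool {
public:
    explicit StagingPool(MasterSemaphore& master_semaphore, std::size_t buffer_size_)
        : ResourcePool(master_semaphore, 32), buffer_size{buffer_size_} {}

    std::byte* Commit() {
        // CommitResource() refreshes the semaphore, finds (or grows to) a free
        // slot, stamps it with the current tick, and returns its index.
        return buffers[CommitResource()].get();
    }

protected:
    void Allocate(std::size_t begin, std::size_t end) override {
        buffers.resize(end);
        for (std::size_t i = begin; i < end; ++i) {
            buffers[i] = std::make_unique<std::byte[]>(buffer_size);
        }
    }

private:
    std::size_t buffer_size;
    std::vector<std::unique_ptr<std::byte[]>> buffers;
};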
+ return old_capacity; +} + +void ResourcePool::Grow() { + const size_t old_capacity = ticks.size(); + ticks.resize(old_capacity + grow_step); + Allocate(old_capacity, old_capacity + grow_step); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h new file mode 100644 index 000000000..a018c7ec2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace Vulkan { + +class MasterSemaphore; + +/** + * Handles a pool of resources protected by fences. Manages resource overflow allocating more + * resources. + */ +class ResourcePool { +public: + explicit ResourcePool(MasterSemaphore& master_semaphore, size_t grow_step); + virtual ~ResourcePool(); + +protected: + size_t CommitResource(); + + /// Called when a chunk of resources have to be allocated. + virtual void Allocate(size_t begin, size_t end) = 0; + +private: + /// Manages pool overflow allocating new resources. + size_t ManageOverflow(); + + /// Allocates a new page of resources. + void Grow(); + + MasterSemaphore& master_semaphore; + size_t grow_step = 0; ///< Number of new resources created after an overflow + size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found + std::vector<u64> ticks; ///< Ticks for each resource +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 07bbcf520..b068888f9 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -2,16 +2,15 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cstring> -#include <optional> #include <unordered_map> -#include "common/assert.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TextureMipmapFilter; + namespace Vulkan { namespace { @@ -42,26 +41,39 @@ VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {} VKSamplerCache::~VKSamplerCache() = default; vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - VkSamplerCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter); - ci.minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter); - ci.mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter); - ci.addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter); - ci.addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter); - ci.addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter); - ci.mipLodBias = tsc.GetLodBias(); - ci.anisotropyEnable = tsc.GetMaxAnisotropy() > 1.0f ? 
VK_TRUE : VK_FALSE; - ci.maxAnisotropy = tsc.GetMaxAnisotropy(); - ci.compareEnable = tsc.depth_compare_enabled; - ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func); - ci.minLod = tsc.GetMinLod(); - ci.maxLod = tsc.GetMaxLod(); - ci.borderColor = ConvertBorderColor(tsc.GetBorderColor()); - ci.unnormalizedCoordinates = VK_FALSE; - return device.GetLogical().CreateSampler(ci); + const bool arbitrary_borders = device.IsExtCustomBorderColorSupported(); + const std::array color = tsc.GetBorderColor(); + + VkSamplerCustomBorderColorCreateInfoEXT border{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, + .pNext = nullptr, + .customBorderColor = {}, + .format = VK_FORMAT_UNDEFINED, + }; + std::memcpy(&border.customBorderColor, color.data(), sizeof(color)); + + return device.GetLogical().CreateSampler({ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = arbitrary_borders ? &border : nullptr, + .flags = 0, + .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), + .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), + .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), + .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), + .mipLodBias = tsc.GetLodBias(), + .anisotropyEnable = + static_cast<VkBool32>(tsc.GetMaxAnisotropy() > 1.0f ? VK_TRUE : VK_FALSE), + .maxAnisotropy = tsc.GetMaxAnisotropy(), + .compareEnable = tsc.depth_compare_enabled, + .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), + .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(), + .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(), + .borderColor = + arbitrary_borders ? 
VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), + .unnormalizedCoordinates = VK_FALSE, + }); } VkSampler VKSamplerCache::ToSamplerType(const vk::Sampler& sampler) const { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 900f551b3..1a483dc71 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -8,11 +8,12 @@ #include <thread> #include <utility> -#include "common/assert.h" #include "common/microprofile.h" +#include "common/thread.h" +#include "video_core/renderer_vulkan/vk_command_pool.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -35,10 +36,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker) - : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, - next_fence{&resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const VKDevice& device_, StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, + master_semaphore{std::make_unique<MasterSemaphore>(device)}, + command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -50,20 +51,27 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -void VKScheduler::Flush(bool release_fence, VkSemaphore semaphore) { +u64 VKScheduler::CurrentTick() const noexcept { + return master_semaphore->CurrentTick(); +} + +bool VKScheduler::IsFree(u64 tick) const noexcept { + return master_semaphore->IsFree(tick); +} + +void VKScheduler::Wait(u64 tick) { + master_semaphore->Wait(tick); +} + +void VKScheduler::Flush(VkSemaphore semaphore) { SubmitExecution(semaphore); - if (release_fence) { - current_fence->Release(); - } AllocateNewContext(); } -void VKScheduler::Finish(bool release_fence, VkSemaphore semaphore) { +void VKScheduler::Finish(VkSemaphore semaphore) { + const u64 presubmit_tick = CurrentTick(); SubmitExecution(semaphore); - current_fence->Wait(); - if (release_fence) { - current_fence->Release(); - } + Wait(presubmit_tick); AllocateNewContext(); } @@ -100,16 +108,19 @@ void VKScheduler::RequestRenderpass(VkRenderPass renderpass, VkFramebuffer frame state.framebuffer = framebuffer; state.render_area = render_area; - VkRenderPassBeginInfo renderpass_bi; - renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderpass_bi.pNext = nullptr; - renderpass_bi.renderPass = renderpass; - renderpass_bi.framebuffer = framebuffer; - renderpass_bi.renderArea.offset.x = 0; - renderpass_bi.renderArea.offset.y = 0; - renderpass_bi.renderArea.extent = render_area; - renderpass_bi.clearValueCount = 0; - renderpass_bi.pClearValues = nullptr; + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = renderpass, + .framebuffer = framebuffer, + .renderArea = + { + .offset = {.x = 0, .y = 0}, + .extent = render_area, + }, + .clearValueCount = 0, + .pClearValues = nullptr, 
+ }; Record([renderpass_bi, end_renderpass](vk::CommandBuffer cmdbuf) { if (end_renderpass) { @@ -134,6 +145,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) { } void VKScheduler::WorkerThread() { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); std::unique_lock lock{mutex}; do { cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; }); @@ -156,35 +168,58 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { current_cmdbuf.End(); - VkSubmitInfo submit_info; - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = nullptr; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = nullptr; - submit_info.pWaitDstStageMask = nullptr; - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = current_cmdbuf.address(); - submit_info.signalSemaphoreCount = semaphore ? 1 : 0; - submit_info.pSignalSemaphores = &semaphore; - device.GetGraphicsQueue().Submit(submit_info, *current_fence); + const VkSemaphore timeline_semaphore = master_semaphore->Handle(); + const u32 num_signal_semaphores = semaphore ? 2U : 1U; + + const u64 signal_value = master_semaphore->CurrentTick(); + const u64 wait_value = signal_value - 1; + const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + + master_semaphore->NextTick(); + + const std::array signal_values{signal_value, u64(0)}; + const std::array signal_semaphores{timeline_semaphore, semaphore}; + + const VkTimelineSemaphoreSubmitInfoKHR timeline_si{ + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreValueCount = 1, + .pWaitSemaphoreValues = &wait_value, + .signalSemaphoreValueCount = num_signal_semaphores, + .pSignalSemaphoreValues = signal_values.data(), + }; + const VkSubmitInfo submit_info{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = &timeline_si, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &timeline_semaphore, + .pWaitDstStageMask = &wait_stage_mask, + .commandBufferCount = 1, + .pCommandBuffers = current_cmdbuf.address(), + .signalSemaphoreCount = num_signal_semaphores, + .pSignalSemaphores = signal_semaphores.data(), + }; + switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) { + case VK_SUCCESS: + break; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + vk::Check(result); + } } void VKScheduler::AllocateNewContext() { - ++ticks; - - VkCommandBufferBeginInfo cmdbuf_bi; - cmdbuf_bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - cmdbuf_bi.pNext = nullptr; - cmdbuf_bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - cmdbuf_bi.pInheritanceInfo = nullptr; - std::unique_lock lock{mutex}; - current_fence = next_fence; - next_fence = &resource_manager.CommitFence(); - current_cmdbuf = vk::CommandBuffer(resource_manager.CommitCommandBuffer(*current_fence), - device.GetDispatchLoader()); - current_cmdbuf.Begin(cmdbuf_bi); + current_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader()); + current_cmdbuf.Begin({ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .pNext = nullptr, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + .pInheritanceInfo = nullptr, + }); // Enable counters once again. These are disabled when a command buffer is finished. 
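SubmitExecution() above now drives everything off a single VK_KHR_timeline_semaphore counter: each submission waits on the previous tick, signals the next one (plus the optional binary semaphore used for presentation), and the host can wait on or poll any tick. The MasterSemaphore wrapper itself lives in vk_master_semaphore.h and is not shown in this diff; the raw primitives it is assumed to sit on look roughly like this (extension entry points are normally fetched through the dispatch loader):

#include <cstdint>
#include <vulkan/vulkan.h>

// Create a timeline semaphore whose 64-bit counter starts at zero.
VkSemaphore CreateTimelineSemaphore(VkDevice device) {
    const VkSemaphoreTypeCreateInfoKHR type_ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR,
        .pNext = nullptr,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR,
        .initialValue = 0,
    };
    const VkSemaphoreCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &type_ci,
        .flags = 0,
    };
    VkSemaphore semaphore = VK_NULL_HANDLE;
    vkCreateSemaphore(device, &ci, nullptr, &semaphore);
    return semaphore;
}

// Host-side wait until the counter reaches 'tick'; this is the primitive behind
// Wait(tick). IsFree(tick) can be built on vkGetSemaphoreCounterValueKHR instead.
void WaitForTick(VkDevice device, VkSemaphore timeline, uint64_t tick) {
    const VkSemaphoreWaitInfoKHR wait_info{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR,
        .pNext = nullptr,
        .flags = 0,
        .semaphoreCount = 1,
        .pSemaphores = &timeline,
        .pValues = &tick,
    };
    vkWaitSemaphoresKHR(device, &wait_info, UINT64_MAX);
}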
if (query_cache) { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 82a8adc69..7be8a19f0 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -7,7 +7,6 @@ #include <atomic> #include <condition_variable> #include <memory> -#include <optional> #include <stack> #include <thread> #include <utility> @@ -17,42 +16,33 @@ namespace Vulkan { +class CommandPool; +class MasterSemaphore; class StateTracker; class VKDevice; -class VKFence; class VKQueryCache; -class VKResourceManager; - -class VKFenceView { -public: - VKFenceView() = default; - VKFenceView(VKFence* const& fence) : fence{fence} {} - - VKFence* operator->() const noexcept { - return fence; - } - - operator VKFence&() const noexcept { - return *fence; - } - -private: - VKFence* const& fence; -}; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker); + explicit VKScheduler(const VKDevice& device, StateTracker& state_tracker); ~VKScheduler(); + /// Returns the current command buffer tick. + [[nodiscard]] u64 CurrentTick() const noexcept; + + /// Returns true when a tick has been triggered by the GPU. + [[nodiscard]] bool IsFree(u64 tick) const noexcept; + + /// Waits for the given tick to trigger on the GPU. + void Wait(u64 tick); + /// Sends the current execution context to the GPU. - void Flush(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Flush(VkSemaphore semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Finish(VkSemaphore semaphore = nullptr); /// Waits for the worker thread to finish executing everything. After this function returns it's /// safe to touch worker resources. @@ -87,14 +77,9 @@ public: (void)chunk->Record(command); } - /// Gets a reference to the current fence. - VKFenceView GetFence() const { - return current_fence; - } - - /// Returns the current command buffer tick. - u64 Ticks() const { - return ticks; + /// Returns the master timeline semaphore. 
+ [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { + return *master_semaphore; } private: @@ -172,6 +157,13 @@ private: std::array<u8, 0x8000> data{}; }; + struct State { + VkRenderPass renderpass = nullptr; + VkFramebuffer framebuffer = nullptr; + VkExtent2D render_area = {0, 0}; + VkPipeline graphics_pipeline = nullptr; + }; + void WorkerThread(); void SubmitExecution(VkSemaphore semaphore); @@ -187,30 +179,23 @@ private: void AcquireNewChunk(); const VKDevice& device; - VKResourceManager& resource_manager; StateTracker& state_tracker; + std::unique_ptr<MasterSemaphore> master_semaphore; + std::unique_ptr<CommandPool> command_pool; + VKQueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; - VKFence* current_fence = nullptr; - VKFence* next_fence = nullptr; - - struct State { - VkRenderPass renderpass = nullptr; - VkFramebuffer framebuffer = nullptr; - VkExtent2D render_area = {0, 0}; - VkPipeline graphics_pipeline = nullptr; - } state; std::unique_ptr<CommandChunk> chunk; std::thread worker_thread; + State state; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; - std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index aaa138f52..a20452b87 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -103,8 +103,8 @@ struct GenericVaryingDescription { }; spv::Dim GetSamplerDim(const Sampler& sampler) { - ASSERT(!sampler.IsBuffer()); - switch (sampler.GetType()) { + ASSERT(!sampler.is_buffer); + switch (sampler.type) { case Tegra::Shader::TextureType::Texture1D: return spv::Dim::Dim1D; case Tegra::Shader::TextureType::Texture2D: @@ -114,13 +114,13 @@ spv::Dim GetSamplerDim(const Sampler& sampler) { case Tegra::Shader::TextureType::TextureCube: return spv::Dim::Cube; default: - UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<u32>(sampler.GetType())); + UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<int>(sampler.type)); return spv::Dim::Dim2D; } } std::pair<spv::Dim, bool> GetImageDim(const Image& image) { - switch (image.GetType()) { + switch (image.type) { case Tegra::Shader::ImageType::Texture1D: return {spv::Dim::Dim1D, false}; case Tegra::Shader::ImageType::TextureBuffer: @@ -134,7 +134,7 @@ std::pair<spv::Dim, bool> GetImageDim(const Image& image) { case Tegra::Shader::ImageType::Texture3D: return {spv::Dim::Dim3D, false}; default: - UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<u32>(image.GetType())); + UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(image.type)); return {spv::Dim::Dim2D, false}; } } @@ -272,12 +272,19 @@ bool IsPrecise(Operation operand) { return false; } +u32 ShaderVersion(const VKDevice& device) { + if (device.InstanceApiVersion() < VK_API_VERSION_1_1) { + return 0x00010000; + } + return 0x00010300; +} + class SPIRVDecompiler final : public Sirit::Module { public: explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, const Registry& registry, const Specialization& specialization) - : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - registry{registry}, specialization{specialization} { + : Module(ShaderVersion(device)), device{device}, ir{ir}, stage{stage}, + header{ir.GetHeader()}, 
registry{registry}, specialization{specialization} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -293,6 +300,7 @@ public: AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); + AddExtension("SPV_KHR_16bit_storage"); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); @@ -400,8 +408,9 @@ private: u32 binding = specialization.base_binding; binding = DeclareConstantBuffers(binding); binding = DeclareGlobalBuffers(binding); - binding = DeclareTexelBuffers(binding); + binding = DeclareUniformTexels(binding); binding = DeclareSamplers(binding); + binding = DeclareStorageTexels(binding); binding = DeclareImages(binding); const Id main = OpFunction(t_void, {}, TypeFunction(t_void)); @@ -515,6 +524,16 @@ private: void DeclareCommon() { thread_id = DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id"); + thread_masks[0] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask"); + thread_masks[1] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask"); + thread_masks[2] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask"); + thread_masks[3] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask"); + thread_masks[4] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask"); } void DeclareVertex() { @@ -674,13 +693,19 @@ private: } t_smem_uint = TypePointer(spv::StorageClass::Workgroup, t_uint); - const u32 smem_size = specialization.shared_memory_size; + u32 smem_size = specialization.shared_memory_size * 4; if (smem_size == 0) { // Avoid declaring an empty array. 
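The magic numbers returned by the ShaderVersion() helper above are the SPIR-V module version word, which packs the version as 0x00MMmm00: 0x00010000 is SPIR-V 1.0 (the safe choice for Vulkan 1.0 instances) and 0x00010300 is SPIR-V 1.3 (guaranteed only on Vulkan 1.1+). A small self-contained illustration of that encoding:

// SPIR-V header version word: bytes are 0 | major | minor | 0.
#include <cstdint>

constexpr std::uint32_t MakeSpirvVersion(std::uint32_t major, std::uint32_t minor) {
    return (major << 16) | (minor << 8);
}
static_assert(MakeSpirvVersion(1, 0) == 0x00010000);  // Vulkan 1.0 baseline
static_assert(MakeSpirvVersion(1, 3) == 0x00010300);  // requires Vulkan 1.1+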
return; } - const auto element_count = static_cast<u32>(Common::AlignUp(smem_size, 4) / 4); - const Id type_array = TypeArray(t_uint, Constant(t_uint, element_count)); + const u32 limit = device.GetMaxComputeSharedMemorySize(); + if (smem_size > limit) { + LOG_ERROR(Render_Vulkan, "Shared memory size {} is clamped to host's limit {}", + smem_size, limit); + smem_size = limit; + } + + const Id type_array = TypeArray(t_uint, Constant(t_uint, smem_size / 4)); const Id type_pointer = TypePointer(spv::StorageClass::Workgroup, type_array); Name(type_pointer, "SharedMemory"); @@ -689,9 +714,9 @@ private: } void DeclareInternalFlags() { - constexpr std::array names = {"zero", "sign", "carry", "overflow"}; + static constexpr std::array names{"zero", "sign", "carry", "overflow"}; + for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) { - const auto flag_code = static_cast<InternalFlag>(flag); const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); internal_flags[flag] = AddGlobalVariable(Name(id, names[flag])); } @@ -731,8 +756,10 @@ private: if (!IsGenericAttribute(index)) { continue; } - const u32 location = GetGenericAttributeLocation(index); + if (!IsAttributeEnabled(location)) { + continue; + } const auto type_descriptor = GetAttributeType(location); Id type; if (IsInputAttributeArray()) { @@ -877,13 +904,13 @@ private: return binding; } - u32 DeclareTexelBuffers(u32 binding) { + u32 DeclareUniformTexels(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { - if (!sampler.IsBuffer()) { + if (!sampler.is_buffer) { continue; } - ASSERT(!sampler.IsArray()); - ASSERT(!sampler.IsShadow()); + ASSERT(!sampler.is_array); + ASSERT(!sampler.is_shadow); constexpr auto dim = spv::Dim::Buffer; constexpr int depth = 0; @@ -894,23 +921,23 @@ private: const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format); const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); + AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.index))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - texel_buffers.emplace(sampler.GetIndex(), TexelBuffer{image_type, id}); + uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id}); } return binding; } u32 DeclareSamplers(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { - if (sampler.IsBuffer()) { + if (sampler.is_buffer) { continue; } const auto dim = GetSamplerDim(sampler); - const int depth = sampler.IsShadow() ? 1 : 0; - const int arrayed = sampler.IsArray() ? 1 : 0; + const int depth = sampler.is_shadow ? 1 : 0; + const int arrayed = sampler.is_array ? 1 : 0; constexpr bool ms = false; constexpr int sampled = 1; constexpr auto format = spv::ImageFormat::Unknown; @@ -918,46 +945,63 @@ private: const Id sampler_type = TypeSampledImage(image_type); const Id sampler_pointer_type = TypePointer(spv::StorageClass::UniformConstant, sampler_type); - const Id type = sampler.IsIndexed() - ? TypeArray(sampler_type, Constant(t_uint, sampler.Size())) + const Id type = sampler.is_indexed + ? 
TypeArray(sampler_type, Constant(t_uint, sampler.size)) : sampler_type; const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); + AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.index))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type, - sampler_pointer_type, id}); + sampled_images.emplace( + sampler.index, SampledImage{image_type, sampler_type, sampler_pointer_type, id}); } return binding; } - u32 DeclareImages(u32 binding) { + u32 DeclareStorageTexels(u32 binding) { for (const auto& image : ir.GetImages()) { - const auto [dim, arrayed] = GetImageDim(image); - constexpr int depth = 0; - constexpr bool ms = false; - constexpr int sampled = 2; // This won't be accessed with a sampler - constexpr auto format = spv::ImageFormat::Unknown; - const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); - const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); - const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("image_{}", image.GetIndex()))); - - Decorate(id, spv::Decoration::Binding, binding++); - Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - if (image.IsRead() && !image.IsWritten()) { - Decorate(id, spv::Decoration::NonWritable); - } else if (image.IsWritten() && !image.IsRead()) { - Decorate(id, spv::Decoration::NonReadable); + if (image.type != Tegra::Shader::ImageType::TextureBuffer) { + continue; } + DeclareImage(image, binding); + } + return binding; + } - images.emplace(static_cast<u32>(image.GetIndex()), StorageImage{image_type, id}); + u32 DeclareImages(u32 binding) { + for (const auto& image : ir.GetImages()) { + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + continue; + } + DeclareImage(image, binding); } return binding; } + void DeclareImage(const Image& image, u32& binding) { + const auto [dim, arrayed] = GetImageDim(image); + constexpr int depth = 0; + constexpr bool ms = false; + constexpr int sampled = 2; // This won't be accessed with a sampler + const auto format = image.is_atomic ? 
spv::ImageFormat::R32ui : spv::ImageFormat::Unknown; + const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); + const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); + AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); + + Decorate(id, spv::Decoration::Binding, binding++); + Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); + if (image.is_read && !image.is_written) { + Decorate(id, spv::Decoration::NonWritable); + } else if (image.is_written && !image.is_read) { + Decorate(id, spv::Decoration::NonReadable); + } + + images.emplace(image.index, StorageImage{image_type, id}); + } + bool IsRenderTargetEnabled(u32 rt) const { for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(rt, component)) { @@ -976,6 +1020,10 @@ private: return stage == ShaderType::TesselationControl; } + bool IsAttributeEnabled(u32 location) const { + return stage != ShaderType::Vertex || specialization.enabled_attributes[location]; + } + u32 GetNumInputVertices() const { switch (stage) { case ShaderType::Geometry: @@ -1071,8 +1119,7 @@ private: void VisitBasicBlock(const NodeBlock& bb) { for (const auto& node : bb) { - [[maybe_unused]] const Type type = Visit(node).type; - ASSERT(type == Type::Void); + Visit(node); } } @@ -1192,16 +1239,20 @@ private: UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element); return {v_float_zero, Type::Float}; default: - if (IsGenericAttribute(attribute)) { - const u32 location = GetGenericAttributeLocation(attribute); - const auto type_descriptor = GetAttributeType(location); - const Type type = type_descriptor.type; - const Id attribute_id = input_attributes.at(attribute); - const std::vector elements = {element}; - const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); - return {OpLoad(GetTypeDefinition(type), pointer), type}; + if (!IsGenericAttribute(attribute)) { + break; } - break; + const u32 location = GetGenericAttributeLocation(attribute); + if (!IsAttributeEnabled(location)) { + // Disabled attributes (also known as constant attributes) always return zero. + return {v_float_zero, Type::Float}; + } + const auto type_descriptor = GetAttributeType(location); + const Type type = type_descriptor.type; + const Id attribute_id = input_attributes.at(attribute); + const std::vector elements = {element}; + const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); + return {OpLoad(GetTypeDefinition(type), pointer), type}; } UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); return {v_float_zero, Type::Float}; @@ -1237,7 +1288,7 @@ private: } else { UNREACHABLE_MSG("Unmanaged offset node type"); } - pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, + pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index, buffer_element); } return {OpLoad(t_float, pointer), Type::Float}; @@ -1362,7 +1413,9 @@ private: Expression target{}; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { - // Writing to Register::ZeroIndex is a no op + // Writing to Register::ZeroIndex is a no op but we still have to visit its source + // because it might have side effects. 
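DeclareImage() above picks spv::ImageFormat::R32ui for images flagged as atomic because SPIR-V image atomics go through OpImageTexelPointer, which needs a concrete 32-bit integer texel format rather than Unknown. The AtomicImage helper further down builds exactly that sequence; in isolation, and using the same builder calls as the surrounding code, it amounts to something like the following (hypothetical standalone helper, decompiler member context assumed):

// Hypothetical member-style helper showing the sequence an is_atomic image
// enables. image_variable is the OpVariable declared with the R32ui image type;
// coords are signed integer texel coordinates.
Id AtomicAddTexel(Id image_variable, Id coords, Id value) {
    // OpImageTexelPointer takes the image *variable* (a pointer), not a loaded image.
    const Id sample = Constant(t_uint, 0);
    const Id pointer = OpImageTexelPointer(t_image_uint, image_variable, coords, sample);
    const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
    const Id semantics = Constant(t_uint, 0);
    return OpAtomicIAdd(t_uint, pointer, scope, semantics, value);
}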
+ Visit(src); return {}; } target = {registers.at(gpr->GetIndex()), Type::Float}; @@ -1584,6 +1637,15 @@ private: return {OpCompositeConstruct(t_half, low, high), Type::HalfFloat}; } + Expression LogicalAddCarry(Operation operation) { + const Id op_a = AsUint(Visit(operation[0])); + const Id op_b = AsUint(Visit(operation[1])); + + const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b); + const Id carry = OpCompositeExtract(t_uint, result, 1); + return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool}; + } + Expression LogicalAssign(Operation operation) { const Node& dest = operation[0]; const Node& src = operation[1]; @@ -1609,13 +1671,31 @@ private: return {}; } + Expression LogicalFOrdered(Operation operation) { + // Emulate SPIR-V's OpOrdered + const Id op_a = AsFloat(Visit(operation[0])); + const Id op_b = AsFloat(Visit(operation[1])); + const Id is_num_a = OpFOrdEqual(t_bool, op_a, op_a); + const Id is_num_b = OpFOrdEqual(t_bool, op_b, op_b); + return {OpLogicalAnd(t_bool, is_num_a, is_num_b), Type::Bool}; + } + + Expression LogicalFUnordered(Operation operation) { + // Emulate SPIR-V's OpUnordered + const Id op_a = AsFloat(Visit(operation[0])); + const Id op_b = AsFloat(Visit(operation[1])); + const Id is_nan_a = OpIsNan(t_bool, op_a); + const Id is_nan_b = OpIsNan(t_bool, op_b); + return {OpLogicalOr(t_bool, is_nan_a, is_nan_b), Type::Bool}; + } + Id GetTextureSampler(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - ASSERT(!meta.sampler.IsBuffer()); + ASSERT(!meta.sampler.is_buffer); - const auto& entry = sampled_images.at(meta.sampler.GetIndex()); + const auto& entry = sampled_images.at(meta.sampler.index); Id sampler = entry.variable; - if (meta.sampler.IsIndexed()) { + if (meta.sampler.is_indexed) { const Id index = AsInt(Visit(meta.index)); sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index); } @@ -1624,9 +1704,9 @@ private: Id GetTextureImage(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - const u32 index = meta.sampler.GetIndex(); - if (meta.sampler.IsBuffer()) { - const auto& entry = texel_buffers.at(index); + const u32 index = meta.sampler.index; + if (meta.sampler.is_buffer) { + const auto& entry = uniform_texels.at(index); return OpLoad(entry.image_type, entry.image); } else { const auto& entry = sampled_images.at(index); @@ -1636,7 +1716,7 @@ private: Id GetImage(Operation operation) { const auto& meta = std::get<MetaImage>(operation.GetMeta()); - const auto entry = images.at(meta.image.GetIndex()); + const auto entry = images.at(meta.image.index); return OpLoad(entry.image_type, entry.image); } @@ -1652,7 +1732,7 @@ private: } if (const auto meta = std::get_if<MetaTexture>(&operation.GetMeta())) { // Add array coordinate for textures - if (meta->sampler.IsArray()) { + if (meta->sampler.is_array) { Id array = AsInt(Visit(meta->array)); if (type == Type::Float) { array = OpConvertSToF(t_float, array); @@ -1758,7 +1838,7 @@ private: operands.push_back(GetOffsetCoordinates(operation)); } - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { const Id dref = AsFloat(Visit(meta.depth_compare)); return {OpImageSampleDrefExplicitLod(t_float, sampler, coords, dref, mask, operands), Type::Float}; @@ -1773,7 +1853,7 @@ private: const Id coords = GetCoordinates(operation, Type::Float); Id texture{}; - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { texture = OpImageDrefGather(t_float4, GetTextureSampler(operation), coords, 
AsFloat(Visit(meta.depth_compare))); } else { @@ -1800,8 +1880,8 @@ private: } const Id lod = AsUint(Visit(operation[0])); - const std::size_t coords_count = [&]() { - switch (const auto type = meta.sampler.GetType(); type) { + const std::size_t coords_count = [&meta] { + switch (const auto type = meta.sampler.type) { case Tegra::Shader::TextureType::Texture1D: return 1; case Tegra::Shader::TextureType::Texture2D: @@ -1810,7 +1890,7 @@ private: case Tegra::Shader::TextureType::Texture3D: return 3; default: - UNREACHABLE_MSG("Invalid texture type={}", static_cast<u32>(type)); + UNREACHABLE_MSG("Invalid texture type={}", static_cast<int>(type)); return 2; } }(); @@ -1853,7 +1933,7 @@ private: const Id image = GetTextureImage(operation); const Id coords = GetCoordinates(operation, Type::Int); Id fetch; - if (meta.lod && !meta.sampler.IsBuffer()) { + if (meta.lod && !meta.sampler.is_buffer) { fetch = OpImageFetch(t_float4, image, coords, spv::ImageOperandsMask::Lod, AsInt(Visit(meta.lod))); } else { @@ -1903,39 +1983,20 @@ private: return {}; } - Expression AtomicImageAdd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMin(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMax(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageAnd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageOr(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + template <Id (Module::*func)(Id, Id, Id, Id, Id)> + Expression AtomicImage(Operation operation) { + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + ASSERT(meta.values.size() == 1); - Expression AtomicImageXor(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + const Id coordinate = GetCoordinates(operation, Type::Int); + const Id image = images.at(meta.image.index).image; + const Id sample = v_uint_zero; + const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample); - Expression AtomicImageExchange(Operation operation) { - UNIMPLEMENTED(); - return {}; + const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); + const Id semantics = v_uint_zero; + const Id value = AsUint(Visit(meta.values[0])); + return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; } template <Id (Module::*func)(Id, Id, Id, Id, Id)> @@ -1950,7 +2011,7 @@ private: return {v_float_zero, Type::Float}; } const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); - const Id semantics = Constant(t_uint, 0); + const Id semantics = v_uint_zero; const Id value = AsUint(Visit(operation[1])); return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; @@ -2148,14 +2209,37 @@ private: return {OpLoad(t_uint, thread_id), Type::Uint}; } + template <std::size_t index> + Expression ThreadMask(Operation) { + // TODO(Rodrigo): Handle devices with different warp sizes + const Id mask = thread_masks[index]; + return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint}; + } + Expression ShuffleIndexed(Operation operation) { const Id value = AsFloat(Visit(operation[0])); const Id index = AsUint(Visit(operation[1])); return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float}; } - Expression MemoryBarrierGL(Operation) { - const auto scope = spv::Scope::Device; + Expression Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled"); + return {}; + } + + const auto scope = 
spv::Scope::Workgroup; + const auto memory = spv::Scope::Workgroup; + const auto semantics = + spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease; + OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)), + Constant(t_uint, static_cast<u32>(memory)), + Constant(t_uint, static_cast<u32>(semantics))); + return {}; + } + + template <spv::Scope scope> + Expression MemoryBarrier(Operation) { const auto semantics = spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory | spv::MemorySemanticsMask::WorkgroupMemory | @@ -2502,7 +2586,14 @@ private: &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::Float>, - &SPIRVDecompiler::Unary<&Module::OpIsNan, Type::Bool, Type::Float>, + &SPIRVDecompiler::LogicalFOrdered, + &SPIRVDecompiler::LogicalFUnordered, + &SPIRVDecompiler::Binary<&Module::OpFUnordLessThan, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordLessThanEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordGreaterThan, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordNotEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordGreaterThanEqual, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpSLessThan, Type::Bool, Type::Int>, &SPIRVDecompiler::Binary<&Module::OpIEqual, Type::Bool, Type::Int>, @@ -2518,6 +2609,8 @@ private: &SPIRVDecompiler::Binary<&Module::OpINotEqual, Type::Bool, Type::Uint>, &SPIRVDecompiler::Binary<&Module::OpUGreaterThanEqual, Type::Bool, Type::Uint>, + &SPIRVDecompiler::LogicalAddCarry, + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool2, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool2, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool2, Type::HalfFloat>, @@ -2542,11 +2635,11 @@ private: &SPIRVDecompiler::ImageLoad, &SPIRVDecompiler::ImageStore, - &SPIRVDecompiler::AtomicImageAdd, - &SPIRVDecompiler::AtomicImageAnd, - &SPIRVDecompiler::AtomicImageOr, - &SPIRVDecompiler::AtomicImageXor, - &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, @@ -2603,9 +2696,16 @@ private: &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>, &SPIRVDecompiler::ThreadId, + &SPIRVDecompiler::ThreadMask<0>, // Eq + &SPIRVDecompiler::ThreadMask<1>, // Ge + &SPIRVDecompiler::ThreadMask<2>, // Gt + &SPIRVDecompiler::ThreadMask<3>, // Le + &SPIRVDecompiler::ThreadMask<4>, // Lt &SPIRVDecompiler::ShuffleIndexed, - &SPIRVDecompiler::MemoryBarrierGL, + &SPIRVDecompiler::Barrier, + &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>, + &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2681,8 +2781,11 @@ private: Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = 
TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); + const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint); + const Id v_float_zero = Constant(t_float, 0.0f); const Id v_float_one = Constant(t_float, 1.0f); + const Id v_uint_zero = Constant(t_uint, 0); // Nvidia uses these defaults for varyings (e.g. position and generic attributes) const Id v_varying_default = @@ -2707,15 +2810,15 @@ private: std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; - std::map<u32, TexelBuffer> texel_buffers; + std::map<u32, TexelBuffer> uniform_texels; std::map<u32, SampledImage> sampled_images; std::map<u32, StorageImage> images; + std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id instance_index{}; Id vertex_index{}; Id base_instance{}; Id base_vertex{}; - std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; Id front_facing{}; @@ -2727,6 +2830,7 @@ private: Id workgroup_id{}; Id local_invocation_id{}; Id thread_id{}; + std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt VertexIndices in_indices; VertexIndices out_indices; @@ -2969,14 +3073,18 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_written); } for (const auto& sampler : ir.GetSamplers()) { - if (sampler.IsBuffer()) { - entries.texel_buffers.emplace_back(sampler); + if (sampler.is_buffer) { + entries.uniform_texels.emplace_back(sampler); } else { entries.samplers.emplace_back(sampler); } } for (const auto& image : ir.GetImages()) { - entries.images.emplace_back(image); + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + entries.storage_texels.emplace_back(image); + } else { + entries.images.emplace_back(image); + } } for (const auto& attribute : ir.GetInputAttributes()) { if (IsGenericAttribute(attribute)) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index ffea4709e..2b0e90396 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -5,11 +5,7 @@ #pragma once #include <array> -#include <bitset> -#include <memory> #include <set> -#include <type_traits> -#include <utility> #include <vector> #include "common/common_types.h" @@ -25,8 +21,9 @@ class VKDevice; namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using TexelBufferEntry = VideoCommon::Shader::Sampler; +using UniformTexelEntry = VideoCommon::Shader::Sampler; using SamplerEntry = VideoCommon::Shader::Sampler; +using StorageTexelEntry = VideoCommon::Shader::Image; using ImageEntry = VideoCommon::Shader::Image; constexpr u32 DESCRIPTOR_SET = 0; @@ -70,13 +67,15 @@ private: struct ShaderEntries { u32 NumBindings() const { return static_cast<u32>(const_buffers.size() + global_buffers.size() + - texel_buffers.size() + samplers.size() + images.size()); + uniform_texels.size() + samplers.size() + storage_texels.size() + + images.size()); } std::vector<ConstBufferEntry> const_buffers; std::vector<GlobalBufferEntry> global_buffers; - std::vector<TexelBufferEntry> texel_buffers; + std::vector<UniformTexelEntry> uniform_texels; std::vector<SamplerEntry> samplers; + std::vector<StorageTexelEntry> storage_texels; std::vector<ImageEntry> images; std::set<u32> attributes; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; @@ -92,7 
+91,8 @@ struct Specialization final { u32 shared_memory_size{}; // Graphics specific - std::optional<float> point_size{}; + std::optional<float> point_size; + std::bitset<Maxwell::NumVertexAttributes> enabled_attributes; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_util.cpp b/src/video_core/renderer_vulkan/vk_shader_util.cpp index 784839327..c1a218d76 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_util.cpp @@ -4,8 +4,7 @@ #include <cstring> #include <memory> -#include <vector> -#include "common/alignment.h" + #include "common/assert.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_device.h" @@ -20,13 +19,13 @@ vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, cons const auto data = std::make_unique<u32[]>(code_size / sizeof(u32)); std::memcpy(data.get(), code_data, code_size); - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.codeSize = code_size; - ci.pCode = data.get(); - return device.GetLogical().CreateShaderModule(ci); + return device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code_size, + .pCode = data.get(), + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_util.h b/src/video_core/renderer_vulkan/vk_shader_util.h index be38d6697..d1d3f3cae 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.h +++ b/src/video_core/renderer_vulkan/vk_shader_util.h @@ -4,7 +4,6 @@ #pragma once -#include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/wrapper.h" diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 94d954d7a..2fd3b7f39 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -10,37 +10,18 @@ #include "common/bit_util.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { -VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, - u64 last_epoch) - : buffer{std::move(buffer)}, watch{fence}, last_epoch{last_epoch} {} +VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer_) + : buffer{std::move(buffer_)} {} -VKStagingBufferPool::StagingBuffer::StagingBuffer(StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; -} - -VKStagingBufferPool::StagingBuffer::~StagingBuffer() = default; - -VKStagingBufferPool::StagingBuffer& VKStagingBufferPool::StagingBuffer::operator=( - StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; - return *this; -} - -VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler) - : device{device}, memory_manager{memory_manager}, scheduler{scheduler}, - 
is_device_integrated{device.IsIntegrated()} {} +VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device_, VKMemoryManager& memory_manager_, + VKScheduler& scheduler_) + : device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_} {} VKStagingBufferPool::~VKStagingBufferPool() = default; @@ -52,21 +33,19 @@ VKBuffer& VKStagingBufferPool::GetUnusedBuffer(std::size_t size, bool host_visib } void VKStagingBufferPool::TickFrame() { - ++epoch; current_delete_level = (current_delete_level + 1) % NumLevels; ReleaseCache(true); - if (!is_device_integrated) { - ReleaseCache(false); - } + ReleaseCache(false); } VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_visible) { - for (auto& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) { - if (entry.watch.TryWatch(scheduler.GetFence())) { - entry.last_epoch = epoch; - return &*entry.buffer; + for (StagingBuffer& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) { + if (!scheduler.IsFree(entry.tick)) { + continue; } + entry.tick = scheduler.CurrentTick(); + return &*entry.buffer; } return nullptr; } @@ -74,28 +53,29 @@ VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) { const u32 log2 = Common::Log2Ceil64(size); - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = 1ULL << log2; - ci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - auto buffer = std::make_unique<VKBuffer>(); - buffer->handle = device.GetLogical().CreateBuffer(ci); + buffer->handle = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 1ULL << log2, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); buffer->commit = memory_manager.Commit(buffer->handle, host_visible); - auto& entries = GetCache(host_visible)[log2].entries; - return *entries.emplace_back(std::move(buffer), scheduler.GetFence(), epoch).buffer; + std::vector<StagingBuffer>& entries = GetCache(host_visible)[log2].entries; + StagingBuffer& entry = entries.emplace_back(std::move(buffer)); + entry.tick = scheduler.CurrentTick(); + return *entry.buffer; } VKStagingBufferPool::StagingBuffersCache& VKStagingBufferPool::GetCache(bool host_visible) { - return is_device_integrated || host_visible ? host_staging_buffers : device_staging_buffers; + return host_visible ? 
host_staging_buffers : device_staging_buffers; } void VKStagingBufferPool::ReleaseCache(bool host_visible) { @@ -113,9 +93,8 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo auto& entries = staging.entries; const std::size_t old_size = entries.size(); - const auto is_deleteable = [this](const auto& entry) { - static constexpr u64 epochs_to_destroy = 180; - return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); + const auto is_deleteable = [this](const StagingBuffer& entry) { + return scheduler.IsFree(entry.tick); }; const std::size_t begin_offset = staging.delete_index; const std::size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size); diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index a0840ff8c..2dd5049ac 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -5,20 +5,16 @@ #pragma once #include <climits> -#include <unordered_map> -#include <utility> #include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { class VKDevice; -class VKFenceWatch; class VKScheduler; struct VKBuffer final { @@ -38,16 +34,10 @@ public: private: struct StagingBuffer final { - explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, u64 last_epoch); - StagingBuffer(StagingBuffer&& rhs) noexcept; - StagingBuffer(const StagingBuffer&) = delete; - ~StagingBuffer(); - - StagingBuffer& operator=(StagingBuffer&& rhs) noexcept; + explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer); std::unique_ptr<VKBuffer> buffer; - VKFenceWatch watch; - u64 last_epoch = 0; + u64 tick = 0; }; struct StagingBuffers final { @@ -71,13 +61,10 @@ private: const VKDevice& device; VKMemoryManager& memory_manager; VKScheduler& scheduler; - const bool is_device_integrated; StagingBuffersCache host_staging_buffers; StagingBuffersCache device_staging_buffers; - u64 epoch = 0; - std::size_t current_delete_level = 0; }; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index 94a89e388..5d2c4a796 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -36,6 +36,14 @@ Flags MakeInvalidationFlags() { flags[BlendConstants] = true; flags[DepthBounds] = true; flags[StencilProperties] = true; + flags[CullMode] = true; + flags[DepthBoundsEnable] = true; + flags[DepthTestEnable] = true; + flags[DepthWriteEnable] = true; + flags[DepthCompareOp] = true; + flags[FrontFace] = true; + flags[StencilOp] = true; + flags[StencilTestEnable] = true; return flags; } @@ -75,14 +83,58 @@ void SetupDirtyStencilProperties(Tables& tables) { table[OFF(stencil_back_func_mask)] = StencilProperties; } -} // Anonymous namespace +void SetupDirtyCullMode(Tables& tables) { + auto& table = tables[0]; + table[OFF(cull_face)] = CullMode; + table[OFF(cull_test_enabled)] = CullMode; +} + +void SetupDirtyDepthBoundsEnable(Tables& tables) { + tables[0][OFF(depth_bounds_enable)] = DepthBoundsEnable; +} + +void SetupDirtyDepthTestEnable(Tables& tables) { + tables[0][OFF(depth_test_enable)] = DepthTestEnable; +} + +void SetupDirtyDepthWriteEnable(Tables& tables) { + tables[0][OFF(depth_write_enabled)] = DepthWriteEnable; +} + 
+void SetupDirtyDepthCompareOp(Tables& tables) { + tables[0][OFF(depth_test_func)] = DepthCompareOp; +} -StateTracker::StateTracker(Core::System& system) - : system{system}, invalidation_flags{MakeInvalidationFlags()} {} +void SetupDirtyFrontFace(Tables& tables) { + auto& table = tables[0]; + table[OFF(front_face)] = FrontFace; + table[OFF(screen_y_control)] = FrontFace; +} + +void SetupDirtyStencilOp(Tables& tables) { + auto& table = tables[0]; + table[OFF(stencil_front_op_fail)] = StencilOp; + table[OFF(stencil_front_op_zfail)] = StencilOp; + table[OFF(stencil_front_op_zpass)] = StencilOp; + table[OFF(stencil_front_func_func)] = StencilOp; + table[OFF(stencil_back_op_fail)] = StencilOp; + table[OFF(stencil_back_op_zfail)] = StencilOp; + table[OFF(stencil_back_op_zpass)] = StencilOp; + table[OFF(stencil_back_func_func)] = StencilOp; + + // Table 0 is used by StencilProperties + tables[1][OFF(stencil_two_side_enable)] = StencilOp; +} -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; - auto& tables = dirty.tables; +void SetupDirtyStencilTestEnable(Tables& tables) { + tables[0][OFF(stencil_enable)] = StencilTestEnable; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Tegra::GPU& gpu) + : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { + auto& tables = gpu.Maxwell3D().dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); @@ -90,10 +142,14 @@ void StateTracker::Initialize() { SetupDirtyBlendConstants(tables); SetupDirtyDepthBounds(tables); SetupDirtyStencilProperties(tables); -} - -void StateTracker::InvalidateCommandBufferState() { - system.GPU().Maxwell3D().dirty.flags |= invalidation_flags; + SetupDirtyCullMode(tables); + SetupDirtyDepthBoundsEnable(tables); + SetupDirtyDepthTestEnable(tables); + SetupDirtyDepthWriteEnable(tables); + SetupDirtyDepthCompareOp(tables); + SetupDirtyFrontFace(tables); + SetupDirtyStencilOp(tables); + SetupDirtyStencilTestEnable(tables); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 03bc415b2..1de789e57 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -26,6 +26,15 @@ enum : u8 { DepthBounds, StencilProperties, + CullMode, + DepthBoundsEnable, + DepthTestEnable, + DepthWriteEnable, + DepthCompareOp, + FrontFace, + StencilOp, + StencilTestEnable, + Last }; static_assert(Last <= std::numeric_limits<u8>::max()); @@ -33,12 +42,15 @@ static_assert(Last <= std::numeric_limits<u8>::max()); } // namespace Dirty class StateTracker { -public: - explicit StateTracker(Core::System& system); + using Maxwell = Tegra::Engines::Maxwell3D::Regs; - void Initialize(); +public: + explicit StateTracker(Tegra::GPU& gpu); - void InvalidateCommandBufferState(); + void InvalidateCommandBufferState() { + flags |= invalidation_flags; + current_topology = INVALID_TOPOLOGY; + } bool TouchViewports() { return Exchange(Dirty::Viewports, false); @@ -64,16 +76,60 @@ public: return Exchange(Dirty::StencilProperties, false); } + bool TouchCullMode() { + return Exchange(Dirty::CullMode, false); + } + + bool TouchDepthBoundsTestEnable() { + return Exchange(Dirty::DepthBoundsEnable, false); + } + + bool TouchDepthTestEnable() { + return Exchange(Dirty::DepthTestEnable, false); + } + + bool TouchDepthBoundsEnable() { + return Exchange(Dirty::DepthBoundsEnable, false); + } + + bool TouchDepthWriteEnable() { 
+ return Exchange(Dirty::DepthWriteEnable, false); + } + + bool TouchDepthCompareOp() { + return Exchange(Dirty::DepthCompareOp, false); + } + + bool TouchFrontFace() { + return Exchange(Dirty::FrontFace, false); + } + + bool TouchStencilOp() { + return Exchange(Dirty::StencilOp, false); + } + + bool TouchStencilTestEnable() { + return Exchange(Dirty::StencilTestEnable, false); + } + + bool ChangePrimitiveTopology(Maxwell::PrimitiveTopology new_topology) { + const bool has_changed = current_topology != new_topology; + current_topology = new_topology; + return has_changed; + } + private: + static constexpr auto INVALID_TOPOLOGY = static_cast<Maxwell::PrimitiveTopology>(~0u); + bool Exchange(std::size_t id, bool new_value) const noexcept { - auto& flags = system.GPU().Maxwell3D().dirty.flags; const bool is_dirty = flags[id]; flags[id] = new_value; return is_dirty; } - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags; + Maxwell::PrimitiveTopology current_topology = INVALID_TOPOLOGY; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 38a93a01a..1b59612b9 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <limits> #include <optional> #include <tuple> #include <vector> @@ -10,7 +11,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -22,27 +22,43 @@ namespace { constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -constexpr u64 STREAM_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256 * 1024 * 1024; -std::optional<u32> FindMemoryType(const VKDevice& device, u32 filter, - VkMemoryPropertyFlags wanted) { - const auto properties = device.GetPhysical().GetMemoryProperties(); - for (u32 i = 0; i < properties.memoryTypeCount; i++) { - if (!(filter & (1 << i))) { - continue; - } - if ((properties.memoryTypes[i].propertyFlags & wanted) == wanted) { +/// Find a memory type with the passed requirements +std::optional<u32> FindMemoryType(const VkPhysicalDeviceMemoryProperties& properties, + VkMemoryPropertyFlags wanted, + u32 filter = std::numeric_limits<u32>::max()) { + for (u32 i = 0; i < properties.memoryTypeCount; ++i) { + const auto flags = properties.memoryTypes[i].propertyFlags; + if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) { return i; } } return std::nullopt; } +/// Get the preferred host visible memory type. +u32 GetMemoryType(const VkPhysicalDeviceMemoryProperties& properties, + u32 filter = std::numeric_limits<u32>::max()) { + // Prefer device local host visible allocations. Both AMD and Nvidia now provide one. + // Otherwise search for a host visible allocation. 
+ static constexpr auto HOST_MEMORY = + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + static constexpr auto DYNAMIC_MEMORY = HOST_MEMORY | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + + std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY); + if (!preferred_type) { + preferred_type = FindMemoryType(properties, HOST_MEMORY); + ASSERT_MSG(preferred_type, "No host visible and coherent memory type found"); + } + return preferred_type.value_or(0); +} + } // Anonymous namespace -VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, +VKStreamBuffer::VKStreamBuffer(const VKDevice& device_, VKScheduler& scheduler_, VkBufferUsageFlags usage) - : device{device}, scheduler{scheduler} { + : device{device_}, scheduler{scheduler_} { CreateBuffers(usage); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); @@ -51,7 +67,7 @@ VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, VKStreamBuffer::~VKStreamBuffer() = default; std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { - ASSERT(size <= STREAM_BUFFER_SIZE); + ASSERT(size <= stream_buffer_size); mapped_size = size; if (alignment > 0) { @@ -61,7 +77,7 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { WaitPendingOperations(offset); bool invalidated = false; - if (offset + size > STREAM_BUFFER_SIZE) { + if (offset + size > stream_buffer_size) { // The buffer would overflow, save the amount of used watches and reset the state. invalidation_mark = current_watch_cursor; current_watch_cursor = 0; @@ -94,44 +110,39 @@ void VKStreamBuffer::Unmap(u64 size) { } auto& watch = current_watches[current_watch_cursor++]; watch.upper_bound = offset; - watch.fence.Watch(scheduler.GetFence()); + watch.tick = scheduler.CurrentTick(); } void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) { - VkBufferCreateInfo buffer_ci; - buffer_ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - buffer_ci.pNext = nullptr; - buffer_ci.flags = 0; - buffer_ci.size = STREAM_BUFFER_SIZE; - buffer_ci.usage = usage; - buffer_ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - buffer_ci.queueFamilyIndexCount = 0; - buffer_ci.pQueueFamilyIndices = nullptr; - - const auto& dev = device.GetLogical(); - buffer = dev.CreateBuffer(buffer_ci); - - const auto& dld = device.GetDispatchLoader(); - const auto requirements = dev.GetBufferMemoryRequirements(*buffer); - // Prefer device local host visible allocations (this should hit AMD's pinned memory). - auto type = - FindMemoryType(device, requirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - if (!type) { - // Otherwise search for a host visible allocation. 
- type = FindMemoryType(device, requirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); - ASSERT_MSG(type, "No host visible and coherent memory type found"); - } - VkMemoryAllocateInfo memory_ai; - memory_ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memory_ai.pNext = nullptr; - memory_ai.allocationSize = requirements.size; - memory_ai.memoryTypeIndex = *type; - - memory = dev.AllocateMemory(memory_ai); + const auto memory_properties = device.GetPhysical().GetMemoryProperties(); + const u32 preferred_type = GetMemoryType(memory_properties); + const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex; + + // Subtract some bytes from the preferred heap size to avoid running out of memory. + const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; + // As per DXVK's example, using `heap_size / 2` + const VkDeviceSize allocable_size = heap_size / 2; + buffer = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = std::min(PREFERRED_STREAM_BUFFER_SIZE, allocable_size), + .usage = usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + + const auto requirements = device.GetLogical().GetBufferMemoryRequirements(*buffer); + const u32 required_flags = requirements.memoryTypeBits; + stream_buffer_size = static_cast<u64>(requirements.size); + + memory = device.GetLogical().AllocateMemory({ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = requirements.size, + .memoryTypeIndex = GetMemoryType(memory_properties, required_flags), + }); buffer.BindMemory(*memory, 0); } @@ -146,7 +157,7 @@ void VKStreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { while (requested_upper_bound < wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; wait_bound = watch.upper_bound; - watch.fence.Wait(); + scheduler.Wait(watch.tick); ++wait_cursor; } } diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 58ce8b973..5e15ad78f 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -14,7 +14,6 @@ namespace Vulkan { class VKDevice; -class VKFence; class VKFenceWatch; class VKScheduler; @@ -35,13 +34,17 @@ public: /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); - VkBuffer GetHandle() const { + VkBuffer Handle() const noexcept { return *buffer; } + u64 Address() const noexcept { + return 0; + } + private: - struct Watch final { - VKFenceWatch fence; + struct Watch { + u64 tick{}; u64 upper_bound{}; }; @@ -56,8 +59,9 @@ private: const VKDevice& device; ///< Vulkan device manager. VKScheduler& scheduler; ///< Command scheduler. - vk::Buffer buffer; ///< Mapped buffer. - vk::DeviceMemory memory; ///< Memory allocation. + vk::Buffer buffer; ///< Mapped buffer. + vk::DeviceMemory memory; ///< Memory allocation. + u64 stream_buffer_size{}; ///< Stream buffer size. u64 offset{}; ///< Buffer iterator. u64 mapped_size{}; ///< Size reserved for the current copy.
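Editor's note: the stream buffer allocation policy above can be restated outside of yuzu's wrapper types as a minimal sketch against the raw Vulkan C API. The helper names below (FindType, PickStreamBufferMemoryType, ClampStreamBufferSize) are illustrative and do not exist in the patch; the policy itself (prefer DEVICE_LOCAL + HOST_VISIBLE + HOST_COHERENT "pinned" memory, fall back to plain HOST_VISIBLE + HOST_COHERENT, and cap the buffer to half of the backing heap) is taken directly from the diff.

// Minimal standalone sketch, assuming the raw Vulkan C API. Helper names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vulkan/vulkan.h>

// Return the first memory type allowed by 'filter' that has all 'wanted' property flags.
static std::optional<uint32_t> FindType(const VkPhysicalDeviceMemoryProperties& props,
                                        VkMemoryPropertyFlags wanted, uint32_t filter) {
    for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
        const bool allowed = (filter & (1u << i)) != 0;
        const bool matches = (props.memoryTypes[i].propertyFlags & wanted) == wanted;
        if (allowed && matches) {
            return i;
        }
    }
    return std::nullopt;
}

// Prefer device-local host-visible memory; otherwise settle for host-visible coherent memory.
static uint32_t PickStreamBufferMemoryType(VkPhysicalDevice physical, uint32_t filter) {
    VkPhysicalDeviceMemoryProperties props;
    vkGetPhysicalDeviceMemoryProperties(physical, &props);

    constexpr VkMemoryPropertyFlags host =
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
    constexpr VkMemoryPropertyFlags dynamic = host | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

    if (const auto type = FindType(props, dynamic, filter)) {
        return *type;
    }
    return FindType(props, host, filter).value_or(0);
}

// Cap the requested size to half of the heap backing the chosen memory type,
// mirroring the heap_size / 2 limit (borrowed from DXVK) used in the diff above.
static VkDeviceSize ClampStreamBufferSize(const VkPhysicalDeviceMemoryProperties& props,
                                          uint32_t memory_type, VkDeviceSize preferred_size) {
    const uint32_t heap = props.memoryTypes[memory_type].heapIndex;
    return std::min(preferred_size, props.memoryHeaps[heap].size / 2);
}

As in the patch, the memory type would be re-evaluated after querying the created buffer's memory requirements, so that only types permitted by requirements.memoryTypeBits are considered for the final allocation.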
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index bffd8f32a..9636a7c65 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -12,7 +12,7 @@ #include "core/core.h" #include "core/frontend/framebuffer_layout.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_swapchain.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -56,8 +56,8 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi } // Anonymous namespace -VKSwapchain::VKSwapchain(VkSurfaceKHR surface, const VKDevice& device) - : surface{surface}, device{device} {} +VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const VKDevice& device_, VKScheduler& scheduler_) + : surface{surface_}, device{device_}, scheduler{scheduler_} {} VKSwapchain::~VKSwapchain() = default; @@ -75,35 +75,33 @@ void VKSwapchain::Create(u32 width, u32 height, bool srgb) { CreateSemaphores(); CreateImageViews(); - fences.resize(image_count, nullptr); + resource_ticks.clear(); + resource_ticks.resize(image_count); } void VKSwapchain::AcquireNextImage() { device.GetLogical().AcquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index], {}, &image_index); - if (auto& fence = fences[image_index]; fence) { - fence->Wait(); - fence->Release(); - fence = nullptr; - } + scheduler.Wait(resource_ticks[image_index]); } -bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { +bool VKSwapchain::Present(VkSemaphore render_semaphore) { const VkSemaphore present_semaphore{*present_semaphores[frame_index]}; const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore}; const auto present_queue{device.GetPresentQueue()}; bool recreated = false; - VkPresentInfoKHR present_info; - present_info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - present_info.pNext = nullptr; - present_info.waitSemaphoreCount = render_semaphore ? 2U : 1U; - present_info.pWaitSemaphores = semaphores.data(); - present_info.swapchainCount = 1; - present_info.pSwapchains = swapchain.address(); - present_info.pImageIndices = &image_index; - present_info.pResults = nullptr; + const VkPresentInfoKHR present_info{ + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreCount = render_semaphore ? 
2U : 1U, + .pWaitSemaphores = semaphores.data(), + .swapchainCount = 1, + .pSwapchains = swapchain.address(), + .pImageIndices = &image_index, + .pResults = nullptr, + }; switch (const VkResult result = present_queue.Present(present_info)) { case VK_SUCCESS: @@ -122,8 +120,7 @@ bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { break; } - ASSERT(fences[image_index] == nullptr); - fences[image_index] = &fence; + resource_ticks[image_index] = scheduler.CurrentTick(); frame_index = (frame_index + 1) % static_cast<u32>(image_count); return recreated; } @@ -147,24 +144,26 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, requested_image_count = capabilities.maxImageCount; } - VkSwapchainCreateInfoKHR swapchain_ci; - swapchain_ci.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - swapchain_ci.pNext = nullptr; - swapchain_ci.flags = 0; - swapchain_ci.surface = surface; - swapchain_ci.minImageCount = requested_image_count; - swapchain_ci.imageFormat = surface_format.format; - swapchain_ci.imageColorSpace = surface_format.colorSpace; - swapchain_ci.imageArrayLayers = 1; - swapchain_ci.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - swapchain_ci.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - swapchain_ci.queueFamilyIndexCount = 0; - swapchain_ci.pQueueFamilyIndices = nullptr; - swapchain_ci.preTransform = capabilities.currentTransform; - swapchain_ci.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - swapchain_ci.presentMode = present_mode; - swapchain_ci.clipped = VK_FALSE; - swapchain_ci.oldSwapchain = nullptr; + VkSwapchainCreateInfoKHR swapchain_ci{ + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .surface = surface, + .minImageCount = requested_image_count, + .imageFormat = surface_format.format, + .imageColorSpace = surface_format.colorSpace, + .imageExtent = {}, + .imageArrayLayers = 1, + .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .preTransform = capabilities.currentTransform, + .compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + .presentMode = present_mode, + .clipped = VK_FALSE, + .oldSwapchain = nullptr, + }; const u32 graphics_family{device.GetGraphicsFamily()}; const u32 present_family{device.GetPresentFamily()}; @@ -173,8 +172,6 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, swapchain_ci.imageSharingMode = VK_SHARING_MODE_CONCURRENT; swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size()); swapchain_ci.pQueueFamilyIndices = queue_indices.data(); - } else { - swapchain_ci.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; } // Request the size again to reduce the possibility of a TOCTOU race condition. 
@@ -200,20 +197,29 @@ void VKSwapchain::CreateSemaphores() { } void VKSwapchain::CreateImageViews() { - VkImageViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - // ci.image - ci.viewType = VK_IMAGE_VIEW_TYPE_2D; - ci.format = image_format; - ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, - VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; - ci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - ci.subresourceRange.baseMipLevel = 0; - ci.subresourceRange.levelCount = 1; - ci.subresourceRange.baseArrayLayer = 0; - ci.subresourceRange.layerCount = 1; + VkImageViewCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = {}, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image_format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; image_views.resize(image_count); for (std::size_t i = 0; i < image_count; i++) { diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index a35d61345..6b39befdf 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -16,11 +16,11 @@ struct FramebufferLayout; namespace Vulkan { class VKDevice; -class VKFence; +class VKScheduler; class VKSwapchain { public: - explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device); + explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device, VKScheduler& scheduler); ~VKSwapchain(); /// Creates (or recreates) the swapchain with a given size. @@ -31,7 +31,7 @@ public: /// Presents the rendered image to the swapchain. Returns true when the swapchains had to be /// recreated. Takes responsability for the ownership of fence. - bool Present(VkSemaphore render_semaphore, VKFence& fence); + bool Present(VkSemaphore render_semaphore); /// Returns true when the framebuffer layout has changed. 
bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const; @@ -74,6 +74,7 @@ private: const VkSurfaceKHR surface; const VKDevice& device; + VKScheduler& scheduler; vk::SwapchainKHR swapchain; @@ -81,7 +82,7 @@ private: std::vector<VkImage> images; std::vector<vk::ImageView> image_views; std::vector<vk::Framebuffer> framebuffers; - std::vector<VKFence*> fences; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> present_semaphores; u32 image_index{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index de4c23120..f2c8f2ae1 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -10,11 +10,9 @@ #include <variant> #include <vector> -#include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" #include "core/core.h" -#include "core/memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/morton.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" @@ -26,7 +24,6 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" -#include "video_core/textures/convert.h" namespace Vulkan { @@ -98,17 +95,18 @@ VkImageViewType GetImageViewType(SurfaceTarget target) { vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, std::size_t host_memory_size) { // TODO(Rodrigo): Move texture buffer creation to the buffer cache - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = static_cast<VkDeviceSize>(host_memory_size); - ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - return device.GetLogical().CreateBuffer(ci); + return device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = static_cast<VkDeviceSize>(host_memory_size), + .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); } VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, @@ -116,15 +114,16 @@ VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, std::size_t host_memory_size) { ASSERT(params.IsBuffer()); - VkBufferViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.buffer = buffer; - ci.format = MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format; - ci.offset = 0; - ci.range = static_cast<VkDeviceSize>(host_memory_size); - return ci; + return { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .buffer = buffer, + .format = + MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format, + .offset = 0, + .range = static_cast<VkDeviceSize>(host_memory_size), + }; } VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { @@ -133,23 +132,24 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP const auto [format, 
attachable, storage] = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); - VkImageCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.imageType = SurfaceTargetToImage(params.target); - ci.format = format; - ci.mipLevels = params.num_levels; - ci.arrayLayers = static_cast<u32>(params.GetNumLayers()); - ci.samples = VK_SAMPLE_COUNT_1_BIT; - ci.tiling = VK_IMAGE_TILING_OPTIMAL; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - ci.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - - ci.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | - VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + VkImageCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = SurfaceTargetToImage(params.target), + .format = format, + .extent = {}, + .mipLevels = params.num_levels, + .arrayLayers = static_cast<u32>(params.GetNumLayers()), + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; if (attachable) { ci.usage |= params.IsPixelFormatZeta() ? VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT : VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; @@ -170,6 +170,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP ci.extent = {params.width, params.height, 1}; break; case SurfaceTarget::Texture3D: + ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; ci.extent = {params.width, params.height, params.depth}; break; case SurfaceTarget::TextureBuffer: @@ -179,14 +180,18 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP return ci; } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace -CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, +CachedSurface::CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool, GPUVAddr gpu_addr, const SurfaceParams& params) - : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, system{system}, - device{device}, resource_manager{resource_manager}, + : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} { if (params.IsBuffer()) { buffer = CreateBuffer(device, params, host_memory_size); @@ -206,9 +211,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, } // TODO(Rodrigo): Move this to a virtual function. 
- main_view = CreateViewInner( - ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), - true); + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } CachedSurface::~CachedSurface() = default; @@ -227,7 +234,7 @@ void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { UNIMPLEMENTED_IF(params.IsBuffer()); - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); } @@ -256,12 +263,8 @@ void CachedSurface::DecorateSurfaceName() { } View CachedSurface::CreateView(const ViewParams& params) { - return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params); } void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { @@ -279,12 +282,10 @@ void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = nullptr; - barrier.srcAccessMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - barrier.dstAccessMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT; - barrier.srcQueueFamilyIndex = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstQueueFamilyIndex = VK_ACCESS_SHADER_READ_BIT; - barrier.srcQueueFamilyIndex = 0; - barrier.dstQueueFamilyIndex = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // They'll be ignored anyway + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst_buffer; barrier.offset = 0; barrier.size = size; @@ -321,22 +322,25 @@ void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { } VkBufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { - VkBufferImageCopy copy; - copy.bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted); - copy.bufferRowLength = 0; - copy.bufferImageHeight = 0; - copy.imageSubresource.aspectMask = image->GetAspectMask(); - copy.imageSubresource.mipLevel = level; - copy.imageSubresource.baseArrayLayer = 0; - copy.imageSubresource.layerCount = static_cast<u32>(params.GetNumLayers()); - copy.imageOffset.x = 0; - copy.imageOffset.y = 0; - copy.imageOffset.z = 0; - copy.imageExtent.width = params.GetMipWidth(level); - copy.imageExtent.height = params.GetMipHeight(level); - copy.imageExtent.depth = - params.target == SurfaceTarget::Texture3D ? 
params.GetMipDepth(level) : 1; - return copy; + return { + .bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted), + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = image->GetAspectMask(), + .mipLevel = level, + .baseArrayLayer = 0, + .layerCount = static_cast<u32>(params.GetNumLayers()), + }, + .imageOffset = {.x = 0, .y = 0, .z = 0}, + .imageExtent = + { + .width = params.GetMipWidth(level), + .height = params.GetMipHeight(level), + .depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1U, + }, + }; } VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { @@ -345,38 +349,44 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { } CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy) + const ViewParams& params) : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, - num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) - : VK_IMAGE_VIEW_TYPE_1D} {} + base_level{params.base_level}, num_levels{params.num_levels}, + image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + base_layer = 0; + num_layers = 1; + base_slice = params.base_layer; + num_slices = params.num_layers; + } else { + base_layer = params.base_layer; + num_layers = params.num_layers; + } +} CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (last_image_view && last_swizzle == swizzle) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (last_image_view && last_swizzle == new_swizzle) { return last_image_view; } - last_swizzle = swizzle; + last_swizzle = new_swizzle; - const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); + const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); auto& image_view = entry->second; if (!is_cache_miss) { return last_image_view = *image_view; } - auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); - auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); - auto swizzle_z = MaxwellToVK::SwizzleSource(z_source); - auto swizzle_w = MaxwellToVK::SwizzleSource(w_source); - - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source), + MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)}; + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. - std::swap(swizzle_x, swizzle_z); + std::swap(swizzle[0], swizzle[2]); } // Games can sample depth or stencil values on textures. 
This is decided by the swizzle value on @@ -386,11 +396,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); const bool is_first = x_source == SwizzleSource::R; switch (params.pixel_format) { - case VideoCore::Surface::PixelFormat::Z24S8: - case VideoCore::Surface::PixelFormat::Z32FS8: + case VideoCore::Surface::PixelFormat::D24_UNORM_S8_UINT: + case VideoCore::Surface::PixelFormat::D32_FLOAT_S8_UINT: aspect = is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; break; - case VideoCore::Surface::PixelFormat::S8Z24: + case VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM: aspect = is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; break; default: @@ -398,44 +408,100 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y UNIMPLEMENTED(); } - // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity - swizzle_x = VK_COMPONENT_SWIZZLE_R; - swizzle_y = VK_COMPONENT_SWIZZLE_G; - swizzle_z = VK_COMPONENT_SWIZZLE_B; - swizzle_w = VK_COMPONENT_SWIZZLE_A; + // Make sure we sample the first component + std::transform( + swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) { + return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component; + }); } - VkImageViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.image = surface.GetImageHandle(); - ci.viewType = image_view_type; - ci.format = surface.GetImage().GetFormat(); - ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w}; - ci.subresourceRange.aspectMask = aspect; - ci.subresourceRange.baseMipLevel = base_level; - ci.subresourceRange.levelCount = num_levels; - ci.subresourceRange.baseArrayLayer = base_layer; - ci.subresourceRange.layerCount = num_layers; - image_view = device.GetLogical().CreateImageView(ci); + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ASSERT(base_slice == 0); + ASSERT(num_slices == params.depth); + } + + image_view = device.GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = surface.GetImageHandle(), + .viewType = image_view_type, + .format = surface.GetImage().GetFormat(), + .components = + { + .r = swizzle[0], + .g = swizzle[1], + .b = swizzle[2], + .a = swizzle[3], + }, + .subresourceRange = + { + .aspectMask = aspect, + .baseMipLevel = base_level, + .levelCount = num_levels, + .baseArrayLayer = base_layer, + .layerCount = num_layers, + }, + }); return last_image_view = *image_view; } -VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool) - : TextureCache(system, rasterizer, device.IsOptimalAstcSupported()), device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, - staging_pool{staging_pool} {} +VkImageView CachedSurfaceView::GetAttachment() { + if (render_target) { + return *render_target; + } + + VkImageViewCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = surface.GetImageHandle(), + .viewType = VK_IMAGE_VIEW_TYPE_1D, + .format = surface.GetImage().GetFormat(), + .components = + { + .r = 
VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = aspect_mask, + .baseMipLevel = base_level, + .levelCount = num_levels, + .baseArrayLayer = 0, + .layerCount = 0, + }, + }; + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; + ci.subresourceRange.baseArrayLayer = base_slice; + ci.subresourceRange.layerCount = num_slices; + } else { + ci.viewType = image_view_type; + ci.subresourceRange.baseArrayLayer = base_layer; + ci.subresourceRange.layerCount = num_layers; + } + render_target = device.GetLogical().CreateImageView(ci); + return *render_target; +} + +VKTextureCache::VKTextureCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const VKDevice& device_, + VKMemoryManager& memory_manager_, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_) + : TextureCache(rasterizer, maxwell3d, gpu_memory, device_.IsOptimalAstcSupported()), + device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ + staging_pool_} {} VKTextureCache::~VKTextureCache() = default; Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { - return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, - scheduler, staging_pool, gpu_addr, params); + return std::make_shared<CachedSurface>(device, memory_manager, scheduler, staging_pool, + gpu_addr, params); } void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, @@ -462,24 +528,40 @@ void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - VkImageCopy copy; - copy.srcSubresource.aspectMask = src_surface->GetAspectMask(); - copy.srcSubresource.mipLevel = copy_params.source_level; - copy.srcSubresource.baseArrayLayer = copy_params.source_z; - copy.srcSubresource.layerCount = num_layers; - copy.srcOffset.x = copy_params.source_x; - copy.srcOffset.y = copy_params.source_y; - copy.srcOffset.z = 0; - copy.dstSubresource.aspectMask = dst_surface->GetAspectMask(); - copy.dstSubresource.mipLevel = copy_params.dest_level; - copy.dstSubresource.baseArrayLayer = dst_base_layer; - copy.dstSubresource.layerCount = num_layers; - copy.dstOffset.x = copy_params.dest_x; - copy.dstOffset.y = copy_params.dest_y; - copy.dstOffset.z = dst_offset_z; - copy.extent.width = copy_params.width; - copy.extent.height = copy_params.height; - copy.extent.depth = extent_z; + const VkImageCopy copy{ + .srcSubresource = + { + .aspectMask = src_surface->GetAspectMask(), + .mipLevel = copy_params.source_level, + .baseArrayLayer = copy_params.source_z, + .layerCount = num_layers, + }, + .srcOffset = + { + .x = static_cast<s32>(copy_params.source_x), + .y = static_cast<s32>(copy_params.source_y), + .z = 0, + }, + .dstSubresource = + { + .aspectMask = dst_surface->GetAspectMask(), + .mipLevel = copy_params.dest_level, + .baseArrayLayer = dst_base_layer, + .layerCount = num_layers, + }, + .dstOffset = + { + .x = static_cast<s32>(copy_params.dest_x), + .y = static_cast<s32>(copy_params.dest_y), + .z = static_cast<s32>(dst_offset_z), + }, + .extent = + { + .width = copy_params.width, + .height = copy_params.height, + .depth = extent_z, + }, + }; const VkImage src_image = 
src_surface->GetImageHandle(); const VkImage dst_image = dst_surface->GetImageHandle(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 115595f28..39202feba 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -7,23 +7,13 @@ #include <memory> #include <unordered_map> -#include "common/assert.h" #include "common/common_types.h" -#include "common/logging/log.h" -#include "common/math_util.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_image.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/texture_cache/surface_base.h" #include "video_core/texture_cache/texture_cache.h" -#include "video_core/textures/decoders.h" - -namespace Core { -class System; -} namespace VideoCore { class RasterizerInterface; @@ -33,7 +23,6 @@ namespace Vulkan { class RasterizerVulkan; class VKDevice; -class VKResourceManager; class VKScheduler; class VKStagingBufferPool; @@ -51,8 +40,7 @@ class CachedSurface final : public VideoCommon::SurfaceBase<View> { friend CachedSurfaceView; public: - explicit CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + explicit CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool, GPUVAddr gpu_addr, const SurfaceParams& params); ~CachedSurface(); @@ -97,7 +85,6 @@ protected: void DecorateSurfaceName(); View CreateView(const ViewParams& params) override; - View CreateViewInner(const ViewParams& params, bool is_proxy); private: void UploadBuffer(const std::vector<u8>& staging_buffer); @@ -108,9 +95,7 @@ private: VkImageSubresourceRange GetImageSubresourceRange() const; - Core::System& system; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKScheduler& scheduler; VKStagingBufferPool& staging_pool; @@ -126,23 +111,20 @@ private: class CachedSurfaceView final : public VideoCommon::ViewBase { public: explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy); + const ViewParams& params); ~CachedSurfaceView(); - VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + VkImageView GetAttachment(); bool IsSameSurface(const CachedSurfaceView& rhs) const { return &surface == &rhs.surface; } - VkImageView GetHandle() { - return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, - Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); - } - u32 GetWidth() const { return params.GetMipWidth(base_level); } @@ -186,14 +168,6 @@ public: } private: - static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - 
(static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - // Store a copy of these values to avoid double dereference when reading them const SurfaceParams params; const VkImage image; @@ -202,24 +176,27 @@ private: const VKDevice& device; CachedSurface& surface; - const u32 base_layer; - const u32 num_layers; const u32 base_level; const u32 num_levels; const VkImageViewType image_view_type; + u32 base_layer = 0; + u32 num_layers = 0; + u32 base_slice = 0; + u32 num_slices = 0; VkImageView last_image_view = nullptr; u32 last_swizzle = 0; + vk::ImageView render_target; std::unordered_map<u32, vk::ImageView> view_cache; }; class VKTextureCache final : public TextureCacheBase { public: - explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool); + explicit VKTextureCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, + const VKDevice& device, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKTextureCache(); private: @@ -234,7 +211,6 @@ private: void BufferCopy(Surface& src_surface, Surface& dst_surface) override; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKScheduler& scheduler; VKStagingBufferPool& staging_pool; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 4bfec0077..351c048d2 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -24,34 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() { } void VKUpdateDescriptorQueue::Acquire() { - entries.clear(); -} + // Minimum number of entries required. + // This is the maximum number of entries a single draw call might use.
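The reservation that follows guarantees the payload vector never reallocates between Acquire() recording upload_start and the deferred UpdateDescriptorSet reading it. A minimal standalone sketch of that reserve-then-defer pattern follows; DeferredWriter, Entry and Flush are invented names for illustration, and std::vector plus reserve() stands in for the boost static_vector used by the real queue.

// Sketch only: pointer stability comes from reserving capacity up front and
// flushing before a batch could ever force a reallocation.
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

struct Entry {
    int value;
};

class DeferredWriter {
public:
    static constexpr std::size_t MIN_ENTRIES = 0x400;
    static constexpr std::size_t CAPACITY = 0x10000;

    DeferredWriter() {
        payload.reserve(CAPACITY);
    }

    // Acquire: make sure the next MIN_ENTRIES pushes cannot reallocate, then
    // remember where this batch starts.
    void Acquire() {
        if (payload.size() + MIN_ENTRIES >= payload.capacity()) {
            Flush(); // stand-in for scheduler.WaitWorker()
            payload.clear();
        }
        batch_start = payload.data() + payload.size();
    }

    void Add(int value) {
        payload.push_back(Entry{value});
    }

    // Send: capture only the stable pointer; the consumer runs later.
    void Send(std::function<void(const Entry*)> consumer) {
        deferred.push_back([ptr = batch_start, consumer] { consumer(ptr); });
    }

    void Flush() {
        for (const auto& func : deferred) {
            func();
        }
        deferred.clear();
    }

private:
    std::vector<Entry> payload;
    const Entry* batch_start = nullptr;
    std::vector<std::function<void()>> deferred;
};

int main() {
    DeferredWriter queue;
    queue.Acquire();
    queue.Add(1);
    queue.Add(2);
    queue.Send([](const Entry* entries) { assert(entries[0].value == 1 && entries[1].value == 2); });
    queue.Flush();
}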
+ static constexpr std::size_t MIN_ENTRIES = 0x400; -void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, - VkDescriptorSet set) { - if (payload.size() + entries.size() >= payload.max_size()) { + if (payload.size() + MIN_ENTRIES >= payload.max_size()) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); payload.clear(); } + upload_start = &*payload.end(); +} - const auto payload_start = payload.data() + payload.size(); - for (const auto& entry : entries) { - if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) { - payload.push_back(*image); - } else if (const auto buffer = std::get_if<Buffer>(&entry)) { - payload.emplace_back(*buffer->buffer, buffer->offset, buffer->size); - } else if (const auto texel = std::get_if<VkBufferView>(&entry)) { - payload.push_back(*texel); - } else { - UNREACHABLE(); - } - } - - scheduler.Record( - [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) { - logical->UpdateDescriptorSet(set, update_template, payload_start); - }); +void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, + VkDescriptorSet set) { + const void* const data = upload_start; + const vk::Device* const logical = &device.GetLogical(); + scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) { + logical->UpdateDescriptorSet(set, update_template, data); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index a9e3d5dba..945320c72 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -4,7 +4,6 @@ #pragma once -#include <type_traits> #include <variant> #include <boost/container/static_vector.hpp> @@ -16,18 +15,13 @@ namespace Vulkan { class VKDevice; class VKScheduler; -class DescriptorUpdateEntry { -public: - explicit DescriptorUpdateEntry() : image{} {} - - DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {} +struct DescriptorUpdateEntry { + DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {} - DescriptorUpdateEntry(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size) - : buffer{buffer, offset, size} {} + DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {} - DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {} + DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {} -private: union { VkDescriptorImageInfo image; VkDescriptorBufferInfo buffer; @@ -47,37 +41,34 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); void AddSampledImage(VkSampler sampler, VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); } void AddImage(VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); } - void AddBuffer(const VkBuffer* buffer, u64 offset, std::size_t size) { - entries.push_back(Buffer{buffer, offset, size}); + void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { + payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); } void AddTexelBuffer(VkBufferView texel_buffer) { - entries.emplace_back(texel_buffer); + payload.emplace_back(texel_buffer); } - VkImageLayout* 
GetLastImageLayout() { - return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout; + VkImageLayout* LastImageLayout() { + return &payload.back().image.imageLayout; } -private: - struct Buffer { - const VkBuffer* buffer = nullptr; - u64 offset = 0; - std::size_t size = 0; - }; - using Variant = std::variant<VkDescriptorImageInfo, Buffer, VkBufferView>; + const VkImageLayout* LastImageLayout() const { + return &payload.back().image.imageLayout; + } +private: const VKDevice& device; VKScheduler& scheduler; - boost::container::static_vector<Variant, 0x400> entries; + const DescriptorUpdateEntry* upload_start = nullptr; boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload; }; diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 9b94dfff1..4e83303d8 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -2,13 +2,16 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> #include <exception> #include <memory> #include <optional> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -16,6 +19,44 @@ namespace Vulkan::vk { namespace { +template <typename Func> +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld, + Func&& func) { + // Calling GetProperties calls Vulkan more than needed. But they are supposed to be cheap + // functions. + std::stable_sort(devices.begin(), devices.end(), + [&dld, &func](VkPhysicalDevice lhs, VkPhysicalDevice rhs) { + return func(vk::PhysicalDevice(lhs, dld).GetProperties(), + vk::PhysicalDevice(rhs, dld).GetProperties()); + }); +} + +void SortPhysicalDevicesPerVendor(std::vector<VkPhysicalDevice>& devices, + const InstanceDispatch& dld, + std::initializer_list<u32> vendor_ids) { + for (auto it = vendor_ids.end(); it != vendor_ids.begin();) { + --it; + SortPhysicalDevices(devices, dld, [id = *it](const auto& lhs, const auto& rhs) { + return lhs.vendorID == id && rhs.vendorID != id; + }); + } +} + +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) { + // Sort by name, this will set a base and make GPUs with higher numbers appear first + // (e.g. GTX 1650 will intentionally be listed before a GTX 1080). + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return std::string_view{lhs.deviceName} > std::string_view{rhs.deviceName}; + }); + // Prefer discrete over non-discrete + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return lhs.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && + rhs.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU; + }); + // Prefer Nvidia over AMD, AMD over Intel, Intel over the rest. 
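The vendor pass that follows works because std::stable_sort preserves the order produced by the earlier name and device-type passes: each later pass only partitions by its own criterion, and everything sorted before acts as a tie-breaker, while iterating the vendor list from the back makes the most preferred vendor the last and therefore strongest pass. A self-contained illustration of that layering, with invented device records rather than real VkPhysicalDeviceProperties:

// Layered stable sorts: the last pass wins, earlier passes break ties.
// The Gpu records below are made up for the example.
#include <algorithm>
#include <iostream>
#include <string_view>
#include <vector>

struct Gpu {
    std::string_view name;
    bool discrete;
    unsigned vendor_id;
};

int main() {
    std::vector<Gpu> gpus{
        {"Generic GPU", false, 0x1234},
        {"GTX 1080", true, 0x10DE},
        {"GTX 1650", true, 0x10DE},
        {"Radeon RX 580", true, 0x1002},
    };
    // Pass 1 (weakest): name, descending.
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const Gpu& lhs, const Gpu& rhs) { return lhs.name > rhs.name; });
    // Pass 2: discrete before integrated.
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const Gpu& lhs, const Gpu& rhs) { return lhs.discrete && !rhs.discrete; });
    // Pass 3 (strongest): preferred vendors, applied from least to most preferred.
    for (unsigned id : {0x8086u, 0x1002u, 0x10DEu}) {
        std::stable_sort(gpus.begin(), gpus.end(), [id](const Gpu& lhs, const Gpu& rhs) {
            return lhs.vendor_id == id && rhs.vendor_id != id;
        });
    }
    for (const Gpu& gpu : gpus) {
        std::cout << gpu.name << '\n';
    }
    // Expected order: GTX 1650, GTX 1080, Radeon RX 580, Generic GPU.
}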
+ SortPhysicalDevicesPerVendor(devices, dld, {0x10DE, 0x1002, 0x8086}); +} + template <typename T> bool Proc(T& result, const InstanceDispatch& dld, const char* proc_name, VkInstance instance = nullptr) noexcept { @@ -61,14 +102,25 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdPipelineBarrier); X(vkCmdPushConstants); X(vkCmdSetBlendConstants); - X(vkCmdSetCheckpointNV); X(vkCmdSetDepthBias); X(vkCmdSetDepthBounds); + X(vkCmdSetEvent); X(vkCmdSetScissor); X(vkCmdSetStencilCompareMask); X(vkCmdSetStencilReference); X(vkCmdSetStencilWriteMask); X(vkCmdSetViewport); + X(vkCmdWaitEvents); + X(vkCmdBindVertexBuffers2EXT); + X(vkCmdSetCullModeEXT); + X(vkCmdSetDepthBoundsTestEnableEXT); + X(vkCmdSetDepthCompareOpEXT); + X(vkCmdSetDepthTestEnableEXT); + X(vkCmdSetDepthWriteEnableEXT); + X(vkCmdSetFrontFaceEXT); + X(vkCmdSetPrimitiveTopologyEXT); + X(vkCmdSetStencilOpEXT); + X(vkCmdSetStencilTestEnableEXT); X(vkCreateBuffer); X(vkCreateBufferView); X(vkCreateCommandPool); @@ -76,6 +128,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCreateDescriptorPool); X(vkCreateDescriptorSetLayout); X(vkCreateDescriptorUpdateTemplateKHR); + X(vkCreateEvent); X(vkCreateFence); X(vkCreateFramebuffer); X(vkCreateGraphicsPipelines); @@ -94,6 +147,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkDestroyDescriptorPool); X(vkDestroyDescriptorSetLayout); X(vkDestroyDescriptorUpdateTemplateKHR); + X(vkDestroyEvent); X(vkDestroyFence); X(vkDestroyFramebuffer); X(vkDestroyImage); @@ -113,10 +167,11 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkFreeMemory); X(vkGetBufferMemoryRequirements); X(vkGetDeviceQueue); + X(vkGetEventStatus); X(vkGetFenceStatus); X(vkGetImageMemoryRequirements); X(vkGetQueryPoolResults); - X(vkGetQueueCheckpointDataNV); + X(vkGetSemaphoreCounterValueKHR); X(vkMapMemory); X(vkQueueSubmit); X(vkResetFences); @@ -125,6 +180,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkUpdateDescriptorSetWithTemplateKHR); X(vkUpdateDescriptorSets); X(vkWaitForFences); + X(vkWaitSemaphoresKHR); #undef X } @@ -132,7 +188,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { bool Load(InstanceDispatch& dld) noexcept { #define X(name) Proc(dld.name, dld, #name) - return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties); + return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) && + X(vkEnumerateInstanceLayerProperties); #undef X } @@ -230,6 +287,22 @@ const char* ToString(VkResult result) noexcept { return "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT"; case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT: return "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT"; + case VkResult::VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; + case VkResult::VK_ERROR_INCOMPATIBLE_VERSION_KHR: + return "VK_ERROR_INCOMPATIBLE_VERSION_KHR"; + case VkResult::VK_THREAD_IDLE_KHR: + return "VK_THREAD_IDLE_KHR"; + case VkResult::VK_THREAD_DONE_KHR: + return "VK_THREAD_DONE_KHR"; + case VkResult::VK_OPERATION_DEFERRED_KHR: + return "VK_OPERATION_DEFERRED_KHR"; + case VkResult::VK_OPERATION_NOT_DEFERRED_KHR: + return "VK_OPERATION_NOT_DEFERRED_KHR"; + case VkResult::VK_PIPELINE_COMPILE_REQUIRED_EXT: + return "VK_PIPELINE_COMPILE_REQUIRED_EXT"; + case VkResult::VK_RESULT_MAX_ENUM: + return "VK_RESULT_MAX_ENUM"; } return "Unknown"; } @@ -271,6 +344,10 @@ void Destroy(VkDevice device, VkDeviceMemory handle, const DeviceDispatch& dld) dld.vkFreeMemory(device, handle, nullptr); } +void Destroy(VkDevice device, 
VkEvent handle, const DeviceDispatch& dld) noexcept { + dld.vkDestroyEvent(device, handle, nullptr); +} + void Destroy(VkDevice device, VkFence handle, const DeviceDispatch& dld) noexcept { dld.vkDestroyFence(device, handle, nullptr); } @@ -339,26 +416,27 @@ VkResult Free(VkDevice device, VkCommandPool handle, Span<VkCommandBuffer> buffe return VK_SUCCESS; } -Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions, +Instance Instance::Create(u32 version, Span<const char*> layers, Span<const char*> extensions, InstanceDispatch& dld) noexcept { - VkApplicationInfo application_info; - application_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - application_info.pNext = nullptr; - application_info.pApplicationName = "yuzu Emulator"; - application_info.applicationVersion = VK_MAKE_VERSION(0, 1, 0); - application_info.pEngineName = "yuzu Emulator"; - application_info.engineVersion = VK_MAKE_VERSION(0, 1, 0); - application_info.apiVersion = VK_API_VERSION_1_1; - - VkInstanceCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.pApplicationInfo = &application_info; - ci.enabledLayerCount = layers.size(); - ci.ppEnabledLayerNames = layers.data(); - ci.enabledExtensionCount = extensions.size(); - ci.ppEnabledExtensionNames = extensions.data(); + const VkApplicationInfo application_info{ + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pNext = nullptr, + .pApplicationName = "yuzu Emulator", + .applicationVersion = VK_MAKE_VERSION(0, 1, 0), + .pEngineName = "yuzu Emulator", + .engineVersion = VK_MAKE_VERSION(0, 1, 0), + .apiVersion = version, + }; + const VkInstanceCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .pApplicationInfo = &application_info, + .enabledLayerCount = layers.size(), + .ppEnabledLayerNames = layers.data(), + .enabledExtensionCount = extensions.size(), + .ppEnabledExtensionNames = extensions.data(), + }; VkInstance instance; if (dld.vkCreateInstance(&ci, nullptr, &instance) != VK_SUCCESS) { @@ -383,24 +461,26 @@ std::optional<std::vector<VkPhysicalDevice>> Instance::EnumeratePhysicalDevices( if (dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data()) != VK_SUCCESS) { return std::nullopt; } - return physical_devices; + SortPhysicalDevices(physical_devices, *dld); + return std::make_optional(std::move(physical_devices)); } DebugCallback Instance::TryCreateDebugCallback( PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept { - VkDebugUtilsMessengerCreateInfoEXT ci; - ci.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - ci.pNext = nullptr; - ci.flags = 0; - ci.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT; - ci.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - ci.pfnUserCallback = callback; - ci.pUserData = nullptr; + const VkDebugUtilsMessengerCreateInfoEXT ci{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = 0, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, + .messageType = 
VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = callback, + .pUserData = nullptr, + }; VkDebugUtilsMessengerEXT messenger; if (dld->vkCreateDebugUtilsMessengerEXT(handle, &ci, nullptr, &messenger) != VK_SUCCESS) { @@ -409,17 +489,6 @@ DebugCallback Instance::TryCreateDebugCallback( return DebugCallback(messenger, handle, *dld); } -std::vector<VkCheckpointDataNV> Queue::GetCheckpointDataNV(const DeviceDispatch& dld) const { - if (!dld.vkGetQueueCheckpointDataNV) { - return {}; - } - u32 num; - dld.vkGetQueueCheckpointDataNV(queue, &num, nullptr); - std::vector<VkCheckpointDataNV> checkpoints(num); - dld.vkGetQueueCheckpointDataNV(queue, &num, checkpoints.data()); - return checkpoints; -} - void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); } @@ -442,12 +511,13 @@ DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) c } CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLevel level) const { - VkCommandBufferAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.commandPool = handle; - ai.level = level; - ai.commandBufferCount = static_cast<u32>(num_buffers); + const VkCommandBufferAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = nullptr, + .commandPool = handle, + .level = level, + .commandBufferCount = static_cast<u32>(num_buffers), + }; std::unique_ptr buffers = std::make_unique<VkCommandBuffer[]>(num_buffers); switch (const VkResult result = dld->vkAllocateCommandBuffers(owner, &ai, buffers.get())) { @@ -469,20 +539,20 @@ std::vector<VkImage> SwapchainKHR::GetImages() const { } Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, - Span<const char*> enabled_extensions, - const VkPhysicalDeviceFeatures2& enabled_features, + Span<const char*> enabled_extensions, const void* next, DeviceDispatch& dld) noexcept { - VkDeviceCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - ci.pNext = &enabled_features; - ci.flags = 0; - ci.queueCreateInfoCount = queues_ci.size(); - ci.pQueueCreateInfos = queues_ci.data(); - ci.enabledLayerCount = 0; - ci.ppEnabledLayerNames = nullptr; - ci.enabledExtensionCount = enabled_extensions.size(); - ci.ppEnabledExtensionNames = enabled_extensions.data(); - ci.pEnabledFeatures = nullptr; + const VkDeviceCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = next, + .flags = 0, + .queueCreateInfoCount = queues_ci.size(), + .pQueueCreateInfos = queues_ci.data(), + .enabledLayerCount = 0, + .ppEnabledLayerNames = nullptr, + .enabledExtensionCount = enabled_extensions.size(), + .ppEnabledExtensionNames = enabled_extensions.data(), + .pEnabledFeatures = nullptr, + }; VkDevice device; if (dld.vkCreateDevice(physical_device, &ci, nullptr, &device) != VK_SUCCESS) { @@ -523,11 +593,15 @@ ImageView Device::CreateImageView(const VkImageViewCreateInfo& ci) const { } Semaphore Device::CreateSemaphore() const { - VkSemaphoreCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; + static constexpr VkSemaphoreCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }; + return CreateSemaphore(ci); +} +Semaphore Device::CreateSemaphore(const VkSemaphoreCreateInfo& 
ci) const { VkSemaphore object; Check(dld->vkCreateSemaphore(handle, &ci, nullptr, &object)); return Semaphore(object, handle, *dld); @@ -613,6 +687,18 @@ ShaderModule Device::CreateShaderModule(const VkShaderModuleCreateInfo& ci) cons return ShaderModule(object, handle, *dld); } +Event Device::CreateEvent() const { + static constexpr VkEventCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }; + + VkEvent object; + Check(dld->vkCreateEvent(handle, &ci, nullptr, &object)); + return Event(object, handle, *dld); +} + SwapchainKHR Device::CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const { VkSwapchainKHR object; Check(dld->vkCreateSwapchainKHR(handle, &ci, nullptr, &object)); @@ -701,8 +787,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s return supported == VK_TRUE; } -VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const - noexcept { +VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const { VkSurfaceCapabilitiesKHR capabilities; Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities)); return capabilities; @@ -733,6 +818,21 @@ VkPhysicalDeviceMemoryProperties PhysicalDevice::GetMemoryProperties() const noe return properties; } +u32 AvailableVersion(const InstanceDispatch& dld) noexcept { + PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion; + if (!Proc(vkEnumerateInstanceVersion, dld, "vkEnumerateInstanceVersion")) { + // If the procedure is not found, Vulkan 1.0 is assumed + return VK_API_VERSION_1_0; + } + u32 version; + if (const VkResult result = vkEnumerateInstanceVersion(&version); result != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "vkEnumerateInstanceVersion returned {}, assuming Vulkan 1.1", + ToString(result)); + return VK_API_VERSION_1_1; + } + return version; +} + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld) { u32 num; @@ -747,4 +847,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp return properties; } +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld) { + u32 num; + if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) { + return std::nullopt; + } + std::vector<VkLayerProperties> properties(num); + if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) { + return std::nullopt; + } + return properties; +} + } // namespace Vulkan::vk diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index fb3657819..f64919623 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -141,6 +141,7 @@ struct InstanceDispatch { PFN_vkCreateInstance vkCreateInstance; PFN_vkDestroyInstance vkDestroyInstance; PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties; + PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT; PFN_vkCreateDevice vkCreateDevice; @@ -197,14 +198,25 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier; PFN_vkCmdPushConstants vkCmdPushConstants; PFN_vkCmdSetBlendConstants vkCmdSetBlendConstants; - PFN_vkCmdSetCheckpointNV vkCmdSetCheckpointNV; PFN_vkCmdSetDepthBias vkCmdSetDepthBias; 
PFN_vkCmdSetDepthBounds vkCmdSetDepthBounds; + PFN_vkCmdSetEvent vkCmdSetEvent; PFN_vkCmdSetScissor vkCmdSetScissor; PFN_vkCmdSetStencilCompareMask vkCmdSetStencilCompareMask; PFN_vkCmdSetStencilReference vkCmdSetStencilReference; PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask; PFN_vkCmdSetViewport vkCmdSetViewport; + PFN_vkCmdWaitEvents vkCmdWaitEvents; + PFN_vkCmdBindVertexBuffers2EXT vkCmdBindVertexBuffers2EXT; + PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT; + PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT; + PFN_vkCmdSetDepthCompareOpEXT vkCmdSetDepthCompareOpEXT; + PFN_vkCmdSetDepthTestEnableEXT vkCmdSetDepthTestEnableEXT; + PFN_vkCmdSetDepthWriteEnableEXT vkCmdSetDepthWriteEnableEXT; + PFN_vkCmdSetFrontFaceEXT vkCmdSetFrontFaceEXT; + PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT; + PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT; + PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT; PFN_vkCreateBuffer vkCreateBuffer; PFN_vkCreateBufferView vkCreateBufferView; PFN_vkCreateCommandPool vkCreateCommandPool; @@ -212,6 +224,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCreateDescriptorPool vkCreateDescriptorPool; PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout; PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; + PFN_vkCreateEvent vkCreateEvent; PFN_vkCreateFence vkCreateFence; PFN_vkCreateFramebuffer vkCreateFramebuffer; PFN_vkCreateGraphicsPipelines vkCreateGraphicsPipelines; @@ -230,6 +243,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool; PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout; PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; + PFN_vkDestroyEvent vkDestroyEvent; PFN_vkDestroyFence vkDestroyFence; PFN_vkDestroyFramebuffer vkDestroyFramebuffer; PFN_vkDestroyImage vkDestroyImage; @@ -249,10 +263,11 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkFreeMemory vkFreeMemory; PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements; PFN_vkGetDeviceQueue vkGetDeviceQueue; + PFN_vkGetEventStatus vkGetEventStatus; PFN_vkGetFenceStatus vkGetFenceStatus; PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements; PFN_vkGetQueryPoolResults vkGetQueryPoolResults; - PFN_vkGetQueueCheckpointDataNV vkGetQueueCheckpointDataNV; + PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR; PFN_vkMapMemory vkMapMemory; PFN_vkQueueSubmit vkQueueSubmit; PFN_vkResetFences vkResetFences; @@ -261,6 +276,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets; PFN_vkWaitForFences vkWaitForFences; + PFN_vkWaitSemaphoresKHR vkWaitSemaphoresKHR; }; /// Loads instance agnostic function pointers. 
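Each new PFN_* member above is filled the same way as the existing ones: wrapper.cpp resolves every entry point by name through its X() macro, and entry points that belong to extensions the driver does not expose simply stay null. A generic, Vulkan-free sketch of that X-macro dispatch-table loader; Resolve(), Dispatch and the stub functions are stand-ins for vkGetDeviceProcAddr and the real PFN types:

// Generic sketch of the X-macro dispatch-table pattern.
// Resolve() stands in for vkGetDeviceProcAddr / vkGetInstanceProcAddr.
#include <cstdio>
#include <cstring>

using VoidFn = void (*)();

void StubDraw() { std::puts("draw"); }
void StubSubmit() { std::puts("submit"); }

// Returns nullptr for unknown names, like a missing extension would.
VoidFn Resolve(const char* name) {
    if (std::strcmp(name, "vkCmdDraw") == 0) return StubDraw;
    if (std::strcmp(name, "vkQueueSubmit") == 0) return StubSubmit;
    return nullptr;
}

struct Dispatch {
    void (*vkCmdDraw)() = nullptr;
    void (*vkQueueSubmit)() = nullptr;
    void (*vkCmdSetCullModeEXT)() = nullptr; // optional extension, may stay null
};

void Load(Dispatch& dld) {
    // Stringize each member name once; the cast adapts the generic pointer to
    // the member's own function-pointer type.
#define X(name) dld.name = reinterpret_cast<decltype(dld.name)>(Resolve(#name))
    X(vkCmdDraw);
    X(vkQueueSubmit);
    X(vkCmdSetCullModeEXT);
#undef X
}

int main() {
    Dispatch dld;
    Load(dld);
    dld.vkCmdDraw();
    dld.vkQueueSubmit();
    if (!dld.vkCmdSetCullModeEXT) {
        std::puts("extended dynamic state not available");
    }
}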
@@ -281,6 +297,7 @@ void Destroy(VkDevice, VkDescriptorPool, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDescriptorSetLayout, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDescriptorUpdateTemplateKHR, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDeviceMemory, const DeviceDispatch&) noexcept; +void Destroy(VkDevice, VkEvent, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkFence, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkFramebuffer, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkImage, const DeviceDispatch&) noexcept; @@ -535,7 +552,6 @@ using PipelineLayout = Handle<VkPipelineLayout, VkDevice, DeviceDispatch>; using QueryPool = Handle<VkQueryPool, VkDevice, DeviceDispatch>; using RenderPass = Handle<VkRenderPass, VkDevice, DeviceDispatch>; using Sampler = Handle<VkSampler, VkDevice, DeviceDispatch>; -using Semaphore = Handle<VkSemaphore, VkDevice, DeviceDispatch>; using ShaderModule = Handle<VkShaderModule, VkDevice, DeviceDispatch>; using SurfaceKHR = Handle<VkSurfaceKHR, VkInstance, InstanceDispatch>; @@ -548,7 +564,7 @@ class Instance : public Handle<VkInstance, NoOwner, InstanceDispatch> { public: /// Creates a Vulkan instance. Use "operator bool" for error handling. - static Instance Create(Span<const char*> layers, Span<const char*> extensions, + static Instance Create(u32 version, Span<const char*> layers, Span<const char*> extensions, InstanceDispatch& dld) noexcept; /// Enumerates physical devices. @@ -567,12 +583,9 @@ public: /// Construct a queue handle. constexpr Queue(VkQueue queue, const DeviceDispatch& dld) noexcept : queue{queue}, dld{&dld} {} - /// Returns the checkpoint data. - /// @note Returns an empty vector when the function pointer is not present. - std::vector<VkCheckpointDataNV> GetCheckpointDataNV(const DeviceDispatch& dld) const; - - void Submit(Span<VkSubmitInfo> submit_infos, VkFence fence) const { - Check(dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence)); + VkResult Submit(Span<VkSubmitInfo> submit_infos, + VkFence fence = VK_NULL_HANDLE) const noexcept { + return dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence); } VkResult Present(const VkPresentInfoKHR& present_info) const noexcept { @@ -654,13 +667,59 @@ public: std::vector<VkImage> GetImages() const; }; +class Event : public Handle<VkEvent, VkDevice, DeviceDispatch> { + using Handle<VkEvent, VkDevice, DeviceDispatch>::Handle; + +public: + VkResult GetStatus() const noexcept { + return dld->vkGetEventStatus(owner, handle); + } +}; + +class Semaphore : public Handle<VkSemaphore, VkDevice, DeviceDispatch> { + using Handle<VkSemaphore, VkDevice, DeviceDispatch>::Handle; + +public: + [[nodiscard]] u64 GetCounter() const { + u64 value; + Check(dld->vkGetSemaphoreCounterValueKHR(owner, handle, &value)); + return value; + } + + /** + * Waits for a timeline semaphore on the host. 
+ * + * @param value Value to wait + * @param timeout Time in nanoseconds to timeout + * @return True on successful wait, false on timeout + */ + bool Wait(u64 value, u64 timeout = std::numeric_limits<u64>::max()) const { + const VkSemaphoreWaitInfoKHR wait_info{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .semaphoreCount = 1, + .pSemaphores = &handle, + .pValues = &value, + }; + const VkResult result = dld->vkWaitSemaphoresKHR(owner, &wait_info, timeout); + switch (result) { + case VK_SUCCESS: + return true; + case VK_TIMEOUT: + return false; + default: + throw Exception(result); + } + } +}; + class Device : public Handle<VkDevice, NoOwner, DeviceDispatch> { using Handle<VkDevice, NoOwner, DeviceDispatch>::Handle; public: static Device Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, - Span<const char*> enabled_extensions, - const VkPhysicalDeviceFeatures2& enabled_features, + Span<const char*> enabled_extensions, const void* next, DeviceDispatch& dld) noexcept; Queue GetQueue(u32 family_index) const noexcept; @@ -675,6 +734,8 @@ public: Semaphore CreateSemaphore() const; + Semaphore CreateSemaphore(const VkSemaphoreCreateInfo& ci) const; + Fence CreateFence(const VkFenceCreateInfo& ci) const; DescriptorPool CreateDescriptorPool(const VkDescriptorPoolCreateInfo& ci) const; @@ -702,6 +763,8 @@ public: ShaderModule CreateShaderModule(const VkShaderModuleCreateInfo& ci) const; + Event CreateEvent() const; + SwapchainKHR CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const; DeviceMemory TryAllocateMemory(const VkMemoryAllocateInfo& ai) const noexcept; @@ -734,18 +797,11 @@ public: dld->vkResetQueryPoolEXT(handle, query_pool, first, count); } - void GetQueryResults(VkQueryPool query_pool, u32 first, u32 count, std::size_t data_size, - void* data, VkDeviceSize stride, VkQueryResultFlags flags) const { - Check(dld->vkGetQueryPoolResults(handle, query_pool, first, count, data_size, data, stride, - flags)); - } - - template <typename T> - T GetQueryResult(VkQueryPool query_pool, u32 first, VkQueryResultFlags flags) const { - static_assert(std::is_trivially_copyable_v<T>); - T value; - GetQueryResults(query_pool, first, 1, sizeof(T), &value, sizeof(T), flags); - return value; + VkResult GetQueryResults(VkQueryPool query_pool, u32 first, u32 count, std::size_t data_size, + void* data, VkDeviceSize stride, + VkQueryResultFlags flags) const noexcept { + return dld->vkGetQueryPoolResults(handle, query_pool, first, count, data_size, data, stride, + flags); } }; @@ -776,7 +832,7 @@ public: bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const; - VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept; + VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const; std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const; @@ -835,8 +891,8 @@ public: dld->vkCmdBindPipeline(handle, bind_point, pipeline); } - void BindIndexBuffer(VkBuffer buffer, VkDeviceSize offset, VkIndexType index_type) const - noexcept { + void BindIndexBuffer(VkBuffer buffer, VkDeviceSize offset, + VkIndexType index_type) const noexcept { dld->vkCmdBindIndexBuffer(handle, buffer, offset, index_type); } @@ -849,8 +905,8 @@ public: BindVertexBuffers(binding, 1, &buffer, &offset); } - void Draw(u32 vertex_count, u32 instance_count, u32 first_vertex, u32 first_instance) const - noexcept { + void Draw(u32 vertex_count, u32 instance_count, u32 first_vertex, + u32 first_instance) const 
noexcept { dld->vkCmdDraw(handle, vertex_count, instance_count, first_vertex, first_instance); } @@ -860,15 +916,15 @@ public: first_instance); } - void ClearAttachments(Span<VkClearAttachment> attachments, Span<VkClearRect> rects) const - noexcept { + void ClearAttachments(Span<VkClearAttachment> attachments, + Span<VkClearRect> rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), rects.data()); } void BlitImage(VkImage src_image, VkImageLayout src_layout, VkImage dst_image, - VkImageLayout dst_layout, Span<VkImageBlit> regions, VkFilter filter) const - noexcept { + VkImageLayout dst_layout, Span<VkImageBlit> regions, + VkFilter filter) const noexcept { dld->vkCmdBlitImage(handle, src_image, src_layout, dst_image, dst_layout, regions.size(), regions.data(), filter); } @@ -893,8 +949,8 @@ public: regions.data()); } - void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, Span<VkBufferCopy> regions) const - noexcept { + void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, + Span<VkBufferCopy> regions) const noexcept { dld->vkCmdCopyBuffer(handle, src_buffer, dst_buffer, regions.size(), regions.data()); } @@ -910,8 +966,8 @@ public: regions.data()); } - void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, u32 data) const - noexcept { + void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, + u32 data) const noexcept { dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); } @@ -920,10 +976,6 @@ public: dld->vkCmdPushConstants(handle, layout, flags, offset, size, values); } - void SetCheckpointNV(const void* checkpoint_marker) const noexcept { - dld->vkCmdSetCheckpointNV(handle, checkpoint_marker); - } - void SetViewport(u32 first, Span<VkViewport> viewports) const noexcept { dld->vkCmdSetViewport(handle, first, viewports.size(), viewports.data()); } @@ -956,6 +1008,63 @@ public: dld->vkCmdSetDepthBounds(handle, min_depth_bounds, max_depth_bounds); } + void SetEvent(VkEvent event, VkPipelineStageFlags stage_flags) const noexcept { + dld->vkCmdSetEvent(handle, event, stage_flags); + } + + void WaitEvents(Span<VkEvent> events, VkPipelineStageFlags src_stage_mask, + VkPipelineStageFlags dst_stage_mask, Span<VkMemoryBarrier> memory_barriers, + Span<VkBufferMemoryBarrier> buffer_barriers, + Span<VkImageMemoryBarrier> image_barriers) const noexcept { + dld->vkCmdWaitEvents(handle, events.size(), events.data(), src_stage_mask, dst_stage_mask, + memory_barriers.size(), memory_barriers.data(), buffer_barriers.size(), + buffer_barriers.data(), image_barriers.size(), image_barriers.data()); + } + + void BindVertexBuffers2EXT(u32 first_binding, u32 binding_count, const VkBuffer* buffers, + const VkDeviceSize* offsets, const VkDeviceSize* sizes, + const VkDeviceSize* strides) const noexcept { + dld->vkCmdBindVertexBuffers2EXT(handle, first_binding, binding_count, buffers, offsets, + sizes, strides); + } + + void SetCullModeEXT(VkCullModeFlags cull_mode) const noexcept { + dld->vkCmdSetCullModeEXT(handle, cull_mode); + } + + void SetDepthBoundsTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthBoundsTestEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + + void SetDepthCompareOpEXT(VkCompareOp compare_op) const noexcept { + dld->vkCmdSetDepthCompareOpEXT(handle, compare_op); + } + + void SetDepthTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthTestEnableEXT(handle, enable ? 
VK_TRUE : VK_FALSE); + } + + void SetDepthWriteEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthWriteEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + + void SetFrontFaceEXT(VkFrontFace front_face) const noexcept { + dld->vkCmdSetFrontFaceEXT(handle, front_face); + } + + void SetPrimitiveTopologyEXT(VkPrimitiveTopology primitive_topology) const noexcept { + dld->vkCmdSetPrimitiveTopologyEXT(handle, primitive_topology); + } + + void SetStencilOpEXT(VkStencilFaceFlags face_mask, VkStencilOp fail_op, VkStencilOp pass_op, + VkStencilOp depth_fail_op, VkCompareOp compare_op) const noexcept { + dld->vkCmdSetStencilOpEXT(handle, face_mask, fail_op, pass_op, depth_fail_op, compare_op); + } + + void SetStencilTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetStencilTestEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers, const VkDeviceSize* offsets, const VkDeviceSize* sizes) const noexcept { @@ -981,7 +1090,12 @@ private: const DeviceDispatch* dld; }; +u32 AvailableVersion(const InstanceDispatch& dld) noexcept; + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld); +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld); + } // namespace Vulkan::vk diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index cca13bcde..8e5a22ab3 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -199,55 +199,48 @@ public: } std::optional<u32> GetGotoLabel() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return {inner->label}; } - return {}; + return std::nullopt; } Expr GetGotoCondition() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return inner->condition; } return nullptr; } void MarkLabelUnused() { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTLabel>(&data)) { inner->unused = true; } } bool IsLabelUnused() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return inner->unused; } return true; } std::optional<u32> GetLabelIndex() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return {inner->index}; } - return {}; + return std::nullopt; } Expr GetIfCondition() const { - auto inner = std::get_if<ASTIfThen>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTIfThen>(&data)) { return inner->condition; } return nullptr; } void SetGotoCondition(Expr new_condition) { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTGoto>(&data)) { inner->condition = std::move(new_condition); } } diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp new file mode 100644 index 000000000..6920afdf2 --- /dev/null +++ b/src/video_core/shader/async_shaders.cpp @@ -0,0 +1,216 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
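The ast.h hunk above rewrites every accessor into the if (const auto* inner = std::get_if<T>(&data)) form, which scopes the pointer to the branch, avoids the throwing std::get, and returns std::nullopt explicitly instead of an empty brace. A standalone illustration of the idiom; Goto, Label and Node are invented stand-ins for the real AST types:

// Standalone illustration of the std::get_if accessor idiom.
#include <cstdio>
#include <optional>
#include <variant>

struct Goto {
    unsigned label;
};
struct Label {
    unsigned index;
    bool unused = false;
};
using Node = std::variant<Goto, Label>;

// Returns the goto label only if the node actually holds a Goto.
std::optional<unsigned> GetGotoLabel(const Node& node) {
    if (const auto* inner = std::get_if<Goto>(&node)) {
        return inner->label;
    }
    return std::nullopt;
}

int main() {
    const Node a = Goto{42};
    const Node b = Label{7};
    if (const auto label = GetGotoLabel(a)) {
        std::printf("a -> goto %u\n", *label);
    }
    std::printf("b has a goto label? %s\n", GetGotoLabel(b) ? "yes" : "no");
}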
+ +#include <condition_variable> +#include <mutex> +#include <thread> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/shader/async_shaders.h" + +namespace VideoCommon::Shader { + +AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window) : emu_window(emu_window) {} + +AsyncShaders::~AsyncShaders() { + KillWorkers(); +} + +void AsyncShaders::AllocateWorkers() { + // Use at least one thread + u32 num_workers = 1; + + // Deduce how many more threads we can use + const u32 thread_count = std::thread::hardware_concurrency(); + if (thread_count >= 8) { + // Increase async workers by 1 for every 2 threads >= 8 + num_workers += 1 + (thread_count - 8) / 2; + } + + // If we already have workers queued, ignore + if (num_workers == worker_threads.size()) { + return; + } + + // If workers already exist, clear them + if (!worker_threads.empty()) { + FreeWorkers(); + } + + // Create workers + for (std::size_t i = 0; i < num_workers; i++) { + context_list.push_back(emu_window.CreateSharedContext()); + worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this, + context_list[i].get()); + } +} + +void AsyncShaders::FreeWorkers() { + // Mark all threads to quit + is_thread_exiting.store(true); + cv.notify_all(); + for (auto& thread : worker_threads) { + thread.join(); + } + // Clear our shared contexts + context_list.clear(); + + // Clear our worker threads + worker_threads.clear(); +} + +void AsyncShaders::KillWorkers() { + is_thread_exiting.store(true); + for (auto& thread : worker_threads) { + thread.detach(); + } + // Clear our shared contexts + context_list.clear(); + + // Clear our worker threads + worker_threads.clear(); +} + +bool AsyncShaders::HasWorkQueued() const { + return !pending_queue.empty(); +} + +bool AsyncShaders::HasCompletedWork() const { + std::shared_lock lock{completed_mutex}; + return !finished_work.empty(); +} + +bool AsyncShaders::IsShaderAsync(const Tegra::GPU& gpu) const { + const auto& regs = gpu.Maxwell3D().regs; + + // If something is using depth, we can assume that games are not rendering anything which will + // be used one time. + if (regs.zeta_enable) { + return true; + } + + // If games are using a small index count, we can assume these are full screen quads. Usually + // these shaders are only used once for building textures so we can assume they can't be built + // async + if (regs.index_array.count <= 6 || regs.vertex_buffer.count <= 6) { + return false; + } + + return true; +} + +std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { + std::vector<Result> results; + { + std::unique_lock lock{completed_mutex}; + results = std::move(finished_work); + finished_work.clear(); + } + return results; +} + +void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, + Tegra::Engines::ShaderType shader_type, u64 uid, + std::vector<u64> code, std::vector<u64> code_b, + u32 main_offset, CompilerSettings compiler_settings, + const Registry& registry, VAddr cpu_addr) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ + .backend = device.UseAssemblyShaders() ? 
Backend::GLASM : Backend::OpenGL, + .device = &device, + .shader_type = shader_type, + .uid = uid, + .code = std::move(code), + .code_b = std::move(code_b), + .main_offset = main_offset, + .compiler_settings = compiler_settings, + .registry = registry, + .cpu_address = cpu_addr, + }); + cv.notify_one(); +} + +void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, + const Vulkan::VKDevice& device, Vulkan::VKScheduler& scheduler, + Vulkan::VKDescriptorPool& descriptor_pool, + Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, + Vulkan::VKRenderPassCache& renderpass_cache, + std::vector<VkDescriptorSetLayoutBinding> bindings, + Vulkan::SPIRVProgram program, + Vulkan::GraphicsPipelineCacheKey key) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ + .backend = Backend::Vulkan, + .pp_cache = pp_cache, + .vk_device = &device, + .scheduler = &scheduler, + .descriptor_pool = &descriptor_pool, + .update_descriptor_queue = &update_descriptor_queue, + .renderpass_cache = &renderpass_cache, + .bindings = std::move(bindings), + .program = std::move(program), + .key = key, + }); + cv.notify_one(); +} + +void AsyncShaders::ShaderCompilerThread(Core::Frontend::GraphicsContext* context) { + while (!is_thread_exiting.load(std::memory_order_relaxed)) { + std::unique_lock lock{queue_mutex}; + cv.wait(lock, [this] { return HasWorkQueued() || is_thread_exiting; }); + if (is_thread_exiting) { + return; + } + + // Partial lock to allow all threads to read at the same time + if (!HasWorkQueued()) { + continue; + } + // Another thread beat us, just unlock and wait for the next load + if (pending_queue.empty()) { + continue; + } + + // Pull work from queue + WorkerParams work = std::move(pending_queue.front()); + pending_queue.pop(); + lock.unlock(); + + if (work.backend == Backend::OpenGL || work.backend == Backend::GLASM) { + const ShaderIR ir(work.code, work.main_offset, work.compiler_settings, *work.registry); + const auto scope = context->Acquire(); + auto program = + OpenGL::BuildShader(*work.device, work.shader_type, work.uid, ir, *work.registry); + Result result{}; + result.backend = work.backend; + result.cpu_address = work.cpu_address; + result.uid = work.uid; + result.code = std::move(work.code); + result.code_b = std::move(work.code_b); + result.shader_type = work.shader_type; + + if (work.backend == Backend::OpenGL) { + result.program.opengl = std::move(program->source_program); + } else if (work.backend == Backend::GLASM) { + result.program.glasm = std::move(program->assembly_program); + } + + { + std::unique_lock complete_lock(completed_mutex); + finished_work.push_back(std::move(result)); + } + } else if (work.backend == Backend::Vulkan) { + auto pipeline = std::make_unique<Vulkan::VKGraphicsPipeline>( + *work.vk_device, *work.scheduler, *work.descriptor_pool, + *work.update_descriptor_queue, *work.renderpass_cache, work.key, work.bindings, + work.program); + + work.pp_cache->EmplacePipeline(std::move(pipeline)); + } + } +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h new file mode 100644 index 000000000..7a99e1dc5 --- /dev/null +++ b/src/video_core/shader/async_shaders.h @@ -0,0 +1,147 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
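ShaderCompilerThread above is a standard condition-variable worker loop: producers push work under queue_mutex and notify, workers wait on a predicate, and after waking they re-check the queue because another worker may already have taken the item. A reduced sketch of that shape, with a plain integer standing in for the shader work item and printf standing in for the actual shader build:

// Reduced sketch of the worker-queue shape: push + notify on one side,
// predicate wait + re-check on the other.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class WorkerPool {
public:
    explicit WorkerPool(std::size_t num_workers) {
        for (std::size_t i = 0; i < num_workers; ++i) {
            threads.emplace_back(&WorkerPool::WorkerLoop, this);
        }
    }

    ~WorkerPool() {
        exiting.store(true);
        cv.notify_all();
        for (auto& thread : threads) {
            thread.join();
        }
    }

    void Queue(int item) {
        {
            std::scoped_lock lock{queue_mutex};
            pending.push(item);
        }
        cv.notify_one();
    }

private:
    void WorkerLoop() {
        while (!exiting.load(std::memory_order_relaxed)) {
            std::unique_lock lock{queue_mutex};
            cv.wait(lock, [this] { return exiting || !pending.empty(); });
            if (exiting) {
                return;
            }
            if (pending.empty()) {
                continue; // another worker took the item first
            }
            const int item = pending.front();
            pending.pop();
            lock.unlock();
            std::printf("compiled item %d\n", item); // stand-in for building a shader
        }
    }

    std::condition_variable cv;
    std::mutex queue_mutex;
    std::queue<int> pending;
    std::atomic<bool> exiting{false};
    std::vector<std::thread> threads;
};

int main() {
    WorkerPool pool{2};
    for (int i = 0; i < 4; ++i) {
        pool.Queue(i);
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
}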
+ +#pragma once + +#include <condition_variable> +#include <memory> +#include <shared_mutex> +#include <thread> + +// This header includes both Vulkan and OpenGL headers; this has to be fixed +// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues. +// Forcefully include glad early and undefine macros +#include <glad/glad.h> +#ifdef CreateEvent +#undef CreateEvent +#endif +#ifdef CreateSemaphore +#undef CreateSemaphore +#endif + +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_shader_decompiler.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Core::Frontend { +class EmuWindow; +class GraphicsContext; +} // namespace Core::Frontend + +namespace Tegra { +class GPU; +} + +namespace Vulkan { +class VKPipelineCache; +} + +namespace VideoCommon::Shader { + +class AsyncShaders { +public: + enum class Backend { + OpenGL, + GLASM, + Vulkan, + }; + + struct ResultPrograms { + OpenGL::OGLProgram opengl; + OpenGL::OGLAssemblyProgram glasm; + }; + + struct Result { + u64 uid; + VAddr cpu_address; + Backend backend; + ResultPrograms program; + std::vector<u64> code; + std::vector<u64> code_b; + Tegra::Engines::ShaderType shader_type; + }; + + explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window); + ~AsyncShaders(); + + /// Start up shader worker threads + void AllocateWorkers(); + + /// Clear the shader queue and kill all worker threads + void FreeWorkers(); + + // Force end all threads + void KillWorkers(); + + /// Check to see if any shaders have actually been compiled + [[nodiscard]] bool HasCompletedWork() const; + + /// Deduce if a shader can be built on another thread or MUST be built in sync. We cannot build + /// every shader async as some shaders are only built and executed once.
We try to "guess" which + /// shader would be used only once + [[nodiscard]] bool IsShaderAsync(const Tegra::GPU& gpu) const; + + /// Pulls completed compiled shaders + [[nodiscard]] std::vector<Result> GetCompletedWork(); + + void QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, + u64 uid, std::vector<u64> code, std::vector<u64> code_b, u32 main_offset, + CompilerSettings compiler_settings, const Registry& registry, + VAddr cpu_addr); + + void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::VKDevice& device, + Vulkan::VKScheduler& scheduler, + Vulkan::VKDescriptorPool& descriptor_pool, + Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, + Vulkan::VKRenderPassCache& renderpass_cache, + std::vector<VkDescriptorSetLayoutBinding> bindings, + Vulkan::SPIRVProgram program, Vulkan::GraphicsPipelineCacheKey key); + +private: + void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); + + /// Check our worker queue to see if we have any work queued already + [[nodiscard]] bool HasWorkQueued() const; + + struct WorkerParams { + Backend backend; + // For OGL + const OpenGL::Device* device; + Tegra::Engines::ShaderType shader_type; + u64 uid; + std::vector<u64> code; + std::vector<u64> code_b; + u32 main_offset; + CompilerSettings compiler_settings; + std::optional<Registry> registry; + VAddr cpu_address; + + // For Vulkan + Vulkan::VKPipelineCache* pp_cache; + const Vulkan::VKDevice* vk_device; + Vulkan::VKScheduler* scheduler; + Vulkan::VKDescriptorPool* descriptor_pool; + Vulkan::VKUpdateDescriptorQueue* update_descriptor_queue; + Vulkan::VKRenderPassCache* renderpass_cache; + std::vector<VkDescriptorSetLayoutBinding> bindings; + Vulkan::SPIRVProgram program; + Vulkan::GraphicsPipelineCacheKey key; + }; + + std::condition_variable cv; + mutable std::mutex queue_mutex; + mutable std::shared_mutex completed_mutex; + std::atomic<bool> is_thread_exiting{}; + std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> context_list; + std::vector<std::thread> worker_threads; + std::queue<WorkerParams> pending_queue; + std::vector<Result> finished_work; + Core::Frontend::EmuWindow& emu_window; +}; + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 2e2711350..4c8971615 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -13,6 +13,7 @@ #include "common/common_types.h" #include "video_core/shader/ast.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -115,17 +116,6 @@ Pred GetPredicate(u32 index, bool negated) { return static_cast<Pred>(static_cast<u64>(index) + (negated ? 8ULL : 0ULL)); } -/** - * Returns whether the instruction at the specified offset is a 'sched' instruction. - * Sched instructions always appear before a sequence of 3 instructions. 
- */ -constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { - constexpr u32 SchedPeriod = 4; - u32 absolute_offset = offset - main_offset; - - return (absolute_offset % SchedPeriod) == 0; -} - enum class ParseResult : u32 { ControlCaught, BlockEnd, @@ -197,24 +187,26 @@ std::optional<std::pair<BufferInfo, u64>> TrackLDC(const CFGRebuildState& state, std::optional<u64> TrackSHLRegister(const CFGRebuildState& state, u32& pos, u64 ldc_tracked_register) { - return TrackInstruction<u64>(state, pos, - [ldc_tracked_register](auto instr, const auto& opcode) { - return opcode.GetId() == OpCode::Id::SHL_IMM && - instr.gpr0.Value() == ldc_tracked_register; - }, - [](auto instr, const auto&) { return instr.gpr8.Value(); }); + return TrackInstruction<u64>( + state, pos, + [ldc_tracked_register](auto instr, const auto& opcode) { + return opcode.GetId() == OpCode::Id::SHL_IMM && + instr.gpr0.Value() == ldc_tracked_register; + }, + [](auto instr, const auto&) { return instr.gpr8.Value(); }); } std::optional<u32> TrackIMNMXValue(const CFGRebuildState& state, u32& pos, u64 shl_tracked_register) { - return TrackInstruction<u32>(state, pos, - [shl_tracked_register](auto instr, const auto& opcode) { - return opcode.GetId() == OpCode::Id::IMNMX_IMM && - instr.gpr0.Value() == shl_tracked_register; - }, - [](auto instr, const auto&) { - return static_cast<u32>(instr.alu.GetSignedImm20_20() + 1); - }); + return TrackInstruction<u32>( + state, pos, + [shl_tracked_register](auto instr, const auto& opcode) { + return opcode.GetId() == OpCode::Id::IMNMX_IMM && + instr.gpr0.Value() == shl_tracked_register; + }, + [](auto instr, const auto&) { + return static_cast<u32>(instr.alu.GetSignedImm20_20() + 1); + }); } std::optional<BranchIndirectInfo> TrackBranchIndirectInfo(const CFGRebuildState& state, u32 pos) { @@ -484,17 +476,17 @@ bool TryInspectAddress(CFGRebuildState& state) { } case BlockCollision::Inside: { // This case is the tricky one: - // We need to Split the block in 2 sepparate blocks + // We need to split the block into 2 separate blocks const u32 end = state.block_info[block_index].end; BlockInfo& new_block = CreateBlockInfo(state, address, end); BlockInfo& current_block = state.block_info[block_index]; current_block.end = address - 1; - new_block.branch = current_block.branch; + new_block.branch = std::move(current_block.branch); BlockBranchInfo forward_branch = MakeBranchInfo<SingleBranch>(); const auto branch = std::get_if<SingleBranch>(forward_branch.get()); branch->address = address; branch->ignore = true; - current_block.branch = forward_branch; + current_block.branch = std::move(forward_branch); return true; } default: @@ -555,13 +547,13 @@ bool TryQuery(CFGRebuildState& state) { gather_labels(q2.ssy_stack, state.ssy_labels, block); gather_labels(q2.pbk_stack, state.pbk_labels, block); if (std::holds_alternative<SingleBranch>(*block.branch)) { - const auto branch = std::get_if<SingleBranch>(block.branch.get()); + auto* branch = std::get_if<SingleBranch>(block.branch.get()); if (!branch->condition.IsUnconditional()) { q2.address = block.end + 1; state.queries.push_back(q2); } - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); if (branch->is_sync) { if (branch->address == unassigned_branch) { branch->address = conditional_query.ssy_stack.top(); @@ -575,23 +567,21 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = branch->address; - state.queries.push_back(std::move(conditional_query)); return 
true; } - const auto multi_branch = std::get_if<MultiBranch>(block.branch.get()); + + const auto* multi_branch = std::get_if<MultiBranch>(block.branch.get()); for (const auto& branch_case : multi_branch->branches) { - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); conditional_query.address = branch_case.address; - state.queries.push_back(std::move(conditional_query)); } + return true; } -} // Anonymous namespace - void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { - const auto get_expr = ([&](const Condition& cond) -> Expr { - Expr result{}; + const auto get_expr = [](const Condition& cond) -> Expr { + Expr result; if (cond.cc != ConditionCode::T) { result = MakeExpr<ExprCondCode>(cond.cc); } @@ -604,10 +594,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { } Expr extra = MakeExpr<ExprPredicate>(pred); if (negate) { - extra = MakeExpr<ExprNot>(extra); + extra = MakeExpr<ExprNot>(std::move(extra)); } if (result) { - return MakeExpr<ExprAnd>(extra, result); + return MakeExpr<ExprAnd>(std::move(extra), std::move(result)); } return extra; } @@ -615,9 +605,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { return result; } return MakeExpr<ExprBoolean>(true); - }); + }; + if (std::holds_alternative<SingleBranch>(*branch_info)) { - const auto branch = std::get_if<SingleBranch>(branch_info.get()); + const auto* branch = std::get_if<SingleBranch>(branch_info.get()); if (branch->address < 0) { if (branch->kill) { mm.InsertReturn(get_expr(branch->condition), true); @@ -629,7 +620,7 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { mm.InsertGoto(get_expr(branch->condition), branch->address); return; } - const auto multi_branch = std::get_if<MultiBranch>(branch_info.get()); + const auto* multi_branch = std::get_if<MultiBranch>(branch_info.get()); for (const auto& branch_case : multi_branch->branches) { mm.InsertGoto(MakeExpr<ExprGprEqual>(multi_branch->gpr, branch_case.cmp_value), branch_case.address); @@ -655,6 +646,8 @@ void DecompileShader(CFGRebuildState& state) { state.manager->Decompile(); } +} // Anonymous namespace + std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, Registry& registry) { diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 87ac9ac6c..eeac328a6 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -13,6 +13,7 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_header.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/node_helper.h" #include "video_core/shader/shader_ir.h" @@ -23,17 +24,6 @@ using Tegra::Shader::OpCode; namespace { -/** - * Returns whether the instruction at the specified offset is a 'sched' instruction. - * Sched instructions always appear before a sequence of 3 instructions. 
- */ -constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { - constexpr u32 SchedPeriod = 4; - u32 absolute_offset = offset - main_offset; - - return (absolute_offset % SchedPeriod) == 0; -} - void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { @@ -42,11 +32,11 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, u32 count{}; std::vector<u32> bound_offsets; for (const auto& sampler : used_samplers) { - if (sampler.IsBindless()) { + if (sampler.is_bindless) { continue; } ++count; - bound_offsets.emplace_back(sampler.GetOffset()); + bound_offsets.emplace_back(sampler.offset); } if (count > 1) { gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets)); @@ -56,14 +46,14 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - const u32 base_offset = sampler_to_deduce.GetOffset(); + const u32 base_offset = sampler_to_deduce.offset; u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { - if (sampler.IsBindless()) { + if (sampler.is_bindless) { continue; } - if (sampler.GetOffset() > base_offset) { - max_offset = std::min(sampler.GetOffset(), max_offset); + if (sampler.offset > base_offset) { + max_offset = std::min(sampler.offset, max_offset); } } if (max_offset == std::numeric_limits<u32>::max()) { @@ -265,7 +255,7 @@ void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) { Node n = Operation(OperationCode::Branch, Immediate(branch_case.address)); Node op_b = Immediate(branch_case.cmp_value); Node condition = - GetPredicateComparisonInteger(Tegra::Shader::PredCondition::Equal, false, op_a, op_b); + GetPredicateComparisonInteger(Tegra::Shader::PredCondition::EQ, false, op_a, op_b); auto result = Conditional(condition, {n}); bb.push_back(result); global_code.push_back(result); @@ -363,14 +353,14 @@ void ShaderIR::PostDecode() { return; } for (auto& sampler : used_samplers) { - if (!sampler.IsIndexed()) { + if (!sampler.is_indexed) { continue; } if (const auto size = TryDeduceSamplerSize(sampler, gpu_driver, used_samplers)) { - sampler.SetSize(*size); + sampler.size = *size; } else { LOG_CRITICAL(HW_GPU, "Failed to deduce size of indexed sampler"); - sampler.SetSize(1); + sampler.size = 1; } } } diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..afef5948d 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -137,7 +137,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index ee7d9a29d..88103fede 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -19,22 +19,49 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = 
OpCode::Decode(instr); - if (opcode->get().GetId() == OpCode::Id::HADD2_C || - opcode->get().GetId() == OpCode::Id::HADD2_R) { + bool negate_a = false; + bool negate_b = false; + bool absolute_a = false; + bool absolute_b = false; + + switch (opcode->get().GetId()) { + case OpCode::Id::HADD2_R: if (instr.alu_half.ftz == 0) { LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } + negate_a = ((instr.value >> 43) & 1) != 0; + negate_b = ((instr.value >> 31) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 30) & 1) != 0; + break; + case OpCode::Id::HADD2_C: + if (instr.alu_half.ftz == 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } + negate_a = ((instr.value >> 43) & 1) != 0; + negate_b = ((instr.value >> 56) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 54) & 1) != 0; + break; + case OpCode::Id::HMUL2_R: + negate_a = ((instr.value >> 43) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 30) & 1) != 0; + break; + case OpCode::Id::HMUL2_C: + negate_b = ((instr.value >> 31) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 54) & 1) != 0; + break; + default: + UNREACHABLE(); + break; } - const bool negate_a = - opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; - const bool negate_b = - opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; - Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); - op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a); + op_a = GetOperandAbsNegHalf(op_a, absolute_a, negate_a); - auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> { + auto [type_b, op_b] = [this, instr, opcode]() -> std::pair<HalfType, Node> { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HMUL2_C: @@ -48,17 +75,16 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { } }(); op_b = UnpackHalfFloat(op_b, type_b); - // redeclaration to avoid a bug in clang with reusing local bindings in lambdas - Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); + op_b = GetOperandAbsNegHalf(op_b, absolute_b, negate_b); - Node value = [&]() { + Node value = [this, opcode, op_a, op_b = op_b] { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: - return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_C: case OpCode::Id::HMUL2_R: - return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); return Immediate(0); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 0f4c3103a..73155966f 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -35,15 +35,38 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD_C: case OpCode::Id::IADD_R: case OpCode::Id::IADD_IMM: { - UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD saturation not implemented"); + UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD.SAT"); + UNIMPLEMENTED_IF_MSG(instr.iadd.x && instr.generates_cc, "IADD.X Rd.CC"); op_a = 
GetOperandAbsNegInteger(op_a, false, instr.alu_integer.negate_a, true); op_b = GetOperandAbsNegInteger(op_b, false, instr.alu_integer.negate_b, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::UAdd, op_a, op_b); - SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, value); + if (instr.iadd.x) { + Node carry = GetInternalFlag(InternalFlag::Carry); + Node x = Operation(OperationCode::Select, std::move(carry), Immediate(1), Immediate(0)); + value = Operation(OperationCode::UAdd, std::move(value), std::move(x)); + } + + if (instr.generates_cc) { + const Node i0 = Immediate(0); + + Node zero = Operation(OperationCode::LogicalIEqual, value, i0); + Node sign = Operation(OperationCode::LogicalILessThan, value, i0); + Node carry = Operation(OperationCode::LogicalAddCarry, op_a, op_b); + + Node pos_a = Operation(OperationCode::LogicalIGreaterThan, op_a, i0); + Node pos_b = Operation(OperationCode::LogicalIGreaterThan, op_b, i0); + Node pos = Operation(OperationCode::LogicalAnd, std::move(pos_a), std::move(pos_b)); + Node overflow = Operation(OperationCode::LogicalAnd, pos, sign); + + SetInternalFlag(bb, InternalFlag::Zero, std::move(zero)); + SetInternalFlag(bb, InternalFlag::Sign, std::move(sign)); + SetInternalFlag(bb, InternalFlag::Carry, std::move(carry)); + SetInternalFlag(bb, InternalFlag::Overflow, std::move(overflow)); + } + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::IADD3_C: @@ -75,12 +98,12 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { op_b = GetOperandAbsNegInteger(op_b, false, instr.iadd3.neg_b, true); op_c = GetOperandAbsNegInteger(op_c, false, instr.iadd3.neg_c, true); - const Node value = [&]() { - const Node add_ab = Operation(OperationCode::IAdd, NO_PRECISE, op_a, op_b); + const Node value = [&] { + Node add_ab = Operation(OperationCode::IAdd, NO_PRECISE, op_a, op_b); if (opcode->get().GetId() != OpCode::Id::IADD3_R) { return Operation(OperationCode::IAdd, NO_PRECISE, add_ab, op_c); } - const Node shifted = [&]() { + const Node shifted = [&] { switch (instr.iadd3.mode) { case Tegra::Shader::IAdd3Mode::RightShift: // TODO(tech4me): According to @@ -249,8 +272,8 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { } case OpCode::Id::LEA_IMM: { const bool neg = instr.lea.imm.neg != 0; - return {Immediate(static_cast<u32>(instr.lea.imm.entry_a)), - GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true), + return {GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true), + Immediate(static_cast<u32>(instr.lea.imm.entry_a)), Immediate(static_cast<u32>(instr.lea.imm.entry_b))}; } case OpCode::Id::LEA_RZ: { diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp index 73880db0e..2a30aab2b 100644 --- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp @@ -28,23 +28,26 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD32I: { UNIMPLEMENTED_IF_MSG(instr.iadd32i.saturate, "IADD32I saturation is not implemented"); - op_a = GetOperandAbsNegInteger(op_a, false, instr.iadd32i.negate_a, true); + op_a = GetOperandAbsNegInteger(std::move(op_a), false, instr.iadd32i.negate_a != 0, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::IAdd, 
PRECISE, std::move(op_a), std::move(op_b)); - SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc != 0); + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::LOP32I: { - if (instr.alu.lop32i.invert_a) - op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_a); + if (instr.alu.lop32i.invert_a) { + op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_a)); + } - if (instr.alu.lop32i.invert_b) - op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_b); + if (instr.alu.lop32i.invert_b) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } - WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, - PredicateResultMode::None, Pred::UnusedIndex, instr.op_32.generates_cc); + WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, std::move(op_a), + std::move(op_b), PredicateResultMode::None, Pred::UnusedIndex, + instr.op_32.generates_cc != 0); break; } default: @@ -58,14 +61,14 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation logic_op, Node op_a, Node op_b, PredicateResultMode predicate_mode, Pred predicate, bool sets_cc) { - const Node result = [&]() { + Node result = [&] { switch (logic_op) { case LogicOperation::And: - return Operation(OperationCode::IBitwiseAnd, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseAnd, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Or: - return Operation(OperationCode::IBitwiseOr, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseOr, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Xor: - return Operation(OperationCode::IBitwiseXor, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseXor, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::PassB: return op_b; default: @@ -84,8 +87,8 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation return; case PredicateResultMode::NotZero: { // Set the predicate to true if the result is not zero. 
- const Node compare = Operation(OperationCode::LogicalINotEqual, result, Immediate(0)); - SetPredicate(bb, static_cast<u64>(predicate), compare); + Node compare = Operation(OperationCode::LogicalINotEqual, std::move(result), Immediate(0)); + SetPredicate(bb, static_cast<u64>(predicate), std::move(compare)); break; } default: diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 848e46874..b2e88fa20 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -13,55 +13,101 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition; u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - if (instr.hset2.ftz == 0) { - LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + PredCondition cond; + bool bf; + bool ftz; + bool neg_a; + bool abs_a; + bool neg_b; + bool abs_b; + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + case OpCode::Id::HSET2_IMM: + cond = instr.hsetp2.cbuf_and_imm.cond; + bf = instr.Bit(53); + ftz = instr.Bit(54); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(56); + abs_b = instr.Bit(54); + break; + case OpCode::Id::HSET2_R: + cond = instr.hsetp2.reg.cond; + bf = instr.Bit(49); + ftz = instr.Bit(50); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(31); + abs_b = instr.Bit(30); + break; + default: + UNREACHABLE(); } - Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - - Node op_b = [&]() { + Node op_b = [this, instr, opcode] { switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + // Inform as unimplemented as this is not tested. + UNIMPLEMENTED_MSG("HSET2_C is not implemented"); + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); case OpCode::Id::HSET2_R: return GetRegister(instr.gpr20); + case OpCode::Id::HSET2_IMM: + return UnpackHalfImmediate(instr, true); default: UNREACHABLE(); - return Immediate(0); + return Node{}; } }(); - op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); - op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); - const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + if (!ftz) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a); + + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_R: + op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b); + [[fallthrough]]; + case OpCode::Id::HSET2_C: + op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b); + break; + default: + break; + } - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); + Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + + Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); // HSET2 operates on each half float in the pack. std::array<Node, 2> values; for (u32 i = 0; i < 2; ++i) { - const u32 raw_value = instr.hset2.bf ? 
0x3c00 : 0xffff; - const Node true_value = Immediate(raw_value << (i * 16)); - const Node false_value = Immediate(0); - - const Node comparison = - Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); - const Node predicate = Operation(combiner, comparison, second_pred); + const u32 raw_value = bf ? 0x3c00 : 0xffff; + Node true_value = Immediate(raw_value << (i * 16)); + Node false_value = Immediate(0); + Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); + Node predicate = Operation(combiner, comparison, second_pred); values[i] = - Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value); + Operation(OperationCode::Select, predicate, move(true_value), move(false_value)); } - const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]); - SetRegister(bb, instr.gpr0, value); + Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]); + SetRegister(bb, instr.gpr0, move(value)); return pc; } diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 08ebca38b..1ed4212ee 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -31,11 +31,11 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, std::size_t component) { const TextureFormat format{descriptor.format}; switch (format) { - case TextureFormat::R16_G16_B16_A16: - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R32_G32_B32: - case TextureFormat::R32_G32: - case TextureFormat::R16_G16: + case TextureFormat::R16G16B16A16: + case TextureFormat::R32G32B32A32: + case TextureFormat::R32G32B32: + case TextureFormat::R32G32: + case TextureFormat::R16G16: case TextureFormat::R32: case TextureFormat::R16: case TextureFormat::R8: @@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, break; case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::B10G11R11: if (component == 0) { return descriptor.b_type; } @@ -107,9 +108,9 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; - case TextureFormat::G8R24: - case TextureFormat::G24R8: - case TextureFormat::G8R8: + case TextureFormat::R24G8: + case TextureFormat::R8G24: + case TextureFormat::R8G8: case TextureFormat::G4R4: if (component == 0) { return descriptor.g_type; @@ -118,6 +119,8 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; + default: + break; } UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; @@ -136,15 +139,15 @@ bool IsComponentEnabled(std::size_t component_mask, std::size_t component) { u32 GetComponentSize(TextureFormat format, std::size_t component) { switch (format) { - case TextureFormat::R32_G32_B32_A32: + case TextureFormat::R32G32B32A32: return 32; - case TextureFormat::R16_G16_B16_A16: + case TextureFormat::R16G16B16A16: return 16; - case TextureFormat::R32_G32_B32: + case TextureFormat::R32G32B32: return component <= 2 ? 32 : 0; - case TextureFormat::R32_G32: + case TextureFormat::R32G32: return component <= 1 ? 32 : 0; - case TextureFormat::R16_G16: + case TextureFormat::R16G16: return component <= 1 ? 16 : 0; case TextureFormat::R32: return component == 0 ? 
32 : 0; @@ -191,7 +194,15 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 6; } return 0; - case TextureFormat::G8R24: + case TextureFormat::B10G11R11: + if (component == 1 || component == 2) { + return 11; + } + if (component == 0) { + return 10; + } + return 0; + case TextureFormat::R24G8: if (component == 0) { return 8; } @@ -199,7 +210,7 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 24; } return 0; - case TextureFormat::G24R8: + case TextureFormat::R8G24: if (component == 0) { return 24; } @@ -207,7 +218,7 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 8; } return 0; - case TextureFormat::G8R8: + case TextureFormat::R8G8: return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 4 : 0; @@ -223,24 +234,25 @@ std::size_t GetImageComponentMask(TextureFormat format) { constexpr u8 B = 0b0100; constexpr u8 A = 0b1000; switch (format) { - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R16_G16_B16_A16: + case TextureFormat::R32G32B32A32: + case TextureFormat::R16G16B16A16: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::A4B4G4R4: case TextureFormat::A5B5G5R1: case TextureFormat::A1B5G5R5: return std::size_t{R | G | B | A}; - case TextureFormat::R32_G32_B32: + case TextureFormat::R32G32B32: case TextureFormat::R32_B24G8: case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::B10G11R11: return std::size_t{R | G | B}; - case TextureFormat::R32_G32: - case TextureFormat::R16_G16: - case TextureFormat::G8R24: - case TextureFormat::G24R8: - case TextureFormat::G8R8: + case TextureFormat::R32G32: + case TextureFormat::R16G16: + case TextureFormat::R24G8: + case TextureFormat::R8G24: + case TextureFormat::R8G8: case TextureFormat::G4R4: return std::size_t{R | G}; case TextureFormat::R32: @@ -299,7 +311,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type, return {std::move(original_value), true}; } default: - UNIMPLEMENTED_MSG("Unimplement component type={}", component_type); + UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type); return {std::move(original_value), true}; } } @@ -352,8 +364,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { registry.ObtainBoundSampler(static_cast<u32>(instr.image.index.Value())); } else { const Node image_register = GetRegister(instr.gpr39); - const auto [base_image, buffer, offset] = TrackCbuf( - image_register, global_code, static_cast<s64>(global_code.size())); + const auto result = TrackCbuf(image_register, global_code, + static_cast<s64>(global_code.size())); + const auto buffer = std::get<1>(result); + const auto offset = std::get<2>(result); descriptor = registry.ObtainBindlessSampler(buffer, offset); } if (!descriptor) { @@ -453,11 +467,14 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return OperationCode::AtomicImageXor; case Tegra::Shader::ImageAtomicOperation::Exch: return OperationCode::AtomicImageExchange; + default: + break; } + break; default: break; } - UNIMPLEMENTED_MSG("Unimplemented operation={} type={}", + UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}", static_cast<u64>(instr.suatom_d.operation.Value()), static_cast<u64>(instr.suatom_d.operation_type.Value())); return OperationCode::AtomicImageAdd; @@ -483,11 +500,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { 
const auto offset = static_cast<u32>(image.index.Value()); - const auto it = - std::find_if(std::begin(used_images), std::end(used_images), - [offset](const Image& entry) { return entry.GetOffset() == offset; }); + const auto it = std::find_if(std::begin(used_images), std::end(used_images), + [offset](const Image& entry) { return entry.offset == offset; }); if (it != std::end(used_images)) { - ASSERT(!it->IsBindless() && it->GetType() == it->GetType()); + ASSERT(!it->is_bindless && it->type == type); return *it; } @@ -497,16 +513,18 @@ Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType t Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { const Node image_register = GetRegister(reg); - const auto [base_image, buffer, offset] = + const auto result = TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size())); - const auto it = - std::find_if(std::begin(used_images), std::end(used_images), - [buffer = buffer, offset = offset](const Image& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); + const auto buffer = std::get<1>(result); + const auto offset = std::get<2>(result); + + const auto it = std::find_if(std::begin(used_images), std::end(used_images), + [buffer, offset](const Image& entry) { + return entry.buffer == buffer && entry.offset == offset; + }); if (it != std::end(used_images)) { - ASSERT(it->IsBindless() && it->GetType() == it->GetType()); + ASSERT(it->is_bindless && it->type == type); return *it; } diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 8112ead3e..e2bba88dd 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -386,8 +386,8 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::RED: { - UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32); - UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add); + UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32, "type={}", + static_cast<int>(instr.red.type.Value())); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); if (!real_address || !base_address) { @@ -396,7 +396,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); Node value = GetRegister(instr.gpr0); - bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value))); + bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value))); break; } case OpCode::Id::ATOM: { @@ -472,14 +472,14 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& const auto [base_address, index, offset] = TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); - ASSERT_OR_EXECUTE_MSG(base_address != nullptr, - { return std::make_tuple(nullptr, nullptr, GlobalMemoryBase{}); }, - "Global memory tracking failed"); + ASSERT_OR_EXECUTE_MSG( + base_address != nullptr, { return std::make_tuple(nullptr, nullptr, GlobalMemoryBase{}); }, + "Global memory tracking failed"); bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset))); const GlobalMemoryBase descriptor{index, offset}; - const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); + const auto& entry = used_global_memory.try_emplace(descriptor).first; auto& usage = entry->second; usage.is_written |= is_write; usage.is_read |= is_read; 
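For reference, the condition-code handling that the IADD hunk in arithmetic_integer.cpp now emits (explicit Zero/Sign/Carry/Overflow updates so IADD.X can later consume the carry) can be summarized in plain C++. This is a minimal sketch of the flag arithmetic only; the struct and function names are illustrative and not part of the patch, and the carry test below is the conventional unsigned wrap-around check standing in for the IR's LogicalAddCarry operation.

// Sketch: host-side equivalent of the Zero/Sign/Carry/Overflow flags the
// decoder emits for IADD when generates_cc is set. Illustrative only.
#include <cstdint>

struct AluFlags {
    bool zero;
    bool sign;
    bool carry;
    bool overflow;
};

AluFlags AddWithFlags(std::uint32_t a, std::uint32_t b, std::uint32_t& result) {
    result = a + b; // wraps modulo 2^32
    AluFlags flags{};
    flags.zero = result == 0;
    flags.sign = static_cast<std::int32_t>(result) < 0;
    flags.carry = result < a; // unsigned wrap-around occurred
    // Mirrors the emitted IR: overflow is flagged when two positive operands
    // produce a negative sum (the all-negative overflow case is not covered,
    // matching the hunk above).
    flags.overflow = static_cast<std::int32_t>(a) > 0 &&
                     static_cast<std::int32_t>(b) > 0 && flags.sign;
    return flags;
}

IADD.X then folds the saved carry back in by selecting 1 or 0 from the Carry internal flag and adding it to the unsigned sum, as the same hunk shows.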
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index d4f95b18c..29a7cfbfe 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -75,15 +75,14 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Node value = [this, instr] { switch (instr.sys20) { case SystemVariable::LaneId: - LOG_WARNING(HW_GPU, "S2R instruction with LaneId is incomplete"); - return Immediate(0U); + return Operation(OperationCode::ThreadId); case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); - return Immediate(0U); + return Immediate(0x00ff'0000U); case SystemVariable::WscaleFactorXY: UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); return Immediate(0U); @@ -109,6 +108,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { return Operation(OperationCode::WorkGroupIdY); case SystemVariable::CtaIdZ: return Operation(OperationCode::WorkGroupIdZ); + case SystemVariable::EqMask: + case SystemVariable::LtMask: + case SystemVariable::LeMask: + case SystemVariable::GtMask: + case SystemVariable::GeMask: + uses_warps = true; + switch (instr.sys20) { + case SystemVariable::EqMask: + return Operation(OperationCode::ThreadEqMask); + case SystemVariable::LtMask: + return Operation(OperationCode::ThreadLtMask); + case SystemVariable::LeMask: + return Operation(OperationCode::ThreadLeMask); + case SystemVariable::GtMask: + return Operation(OperationCode::ThreadGtMask); + case SystemVariable::GeMask: + return Operation(OperationCode::ThreadGeMask); + default: + UNREACHABLE(); + return Immediate(0u); + } default: UNIMPLEMENTED_MSG("Unhandled system move: {}", static_cast<u32>(instr.sys20.Value())); @@ -272,10 +292,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8)); break; } + case OpCode::Id::BAR: { + UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0"); + bb.push_back(Operation(OperationCode::Barrier)); + break; + } case OpCode::Id::MEMBAR: { - UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL); UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default); - bb.push_back(Operation(OperationCode::MemoryBarrierGL)); + const OperationCode type = [instr] { + switch (instr.membar.type) { + case Tegra::Shader::MembarType::CTA: + return OperationCode::MemoryBarrierGroup; + case Tegra::Shader::MembarType::GL: + return OperationCode::MemoryBarrierGlobal; + default: + UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value())); + return OperationCode::MemoryBarrierGlobal; + } + }(); + bb.push_back(Operation(type)); break; } case OpCode::Id::DEPBAR: { diff --git a/src/video_core/shader/decode/register_set_predicate.cpp b/src/video_core/shader/decode/register_set_predicate.cpp index 8d54cce34..6116c31aa 100644 --- a/src/video_core/shader/decode/register_set_predicate.cpp +++ b/src/video_core/shader/decode/register_set_predicate.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <utility> + #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/shader_bytecode.h" @@ -10,20 +12,20 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; namespace { -constexpr u64 NUM_PROGRAMMABLE_PREDICATES = 7; -} +constexpr u64 NUM_CONDITION_CODES = 4; +constexpr u64 NUM_PREDICATES = 7; +} // namespace u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.p2r_r2p.mode != Tegra::Shader::R2pMode::Pr); - - const Node apply_mask = [&] { + Node apply_mask = [this, opcode, instr] { switch (opcode->get().GetId()) { case OpCode::Id::R2P_IMM: case OpCode::Id::P2R_IMM: @@ -34,39 +36,43 @@ u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) { } }(); - const auto offset = static_cast<u32>(instr.p2r_r2p.byte) * 8; + const u32 offset = static_cast<u32>(instr.p2r_r2p.byte) * 8; + + const bool cc = instr.p2r_r2p.mode == Tegra::Shader::R2pMode::Cc; + const u64 num_entries = cc ? NUM_CONDITION_CODES : NUM_PREDICATES; + const auto get_entry = [this, cc](u64 entry) { + return cc ? GetInternalFlag(static_cast<InternalFlag>(entry)) : GetPredicate(entry); + }; switch (opcode->get().GetId()) { case OpCode::Id::R2P_IMM: { - const Node mask = GetRegister(instr.gpr8); + Node mask = GetRegister(instr.gpr8); - for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { - const auto shift = static_cast<u32>(pred); + for (u64 entry = 0; entry < num_entries; ++entry) { + const u32 shift = static_cast<u32>(entry); - const Node apply_compare = BitfieldExtract(apply_mask, shift, 1); - const Node condition = - Operation(OperationCode::LogicalUNotEqual, apply_compare, Immediate(0)); + Node apply = BitfieldExtract(apply_mask, shift, 1); + Node condition = Operation(OperationCode::LogicalUNotEqual, apply, Immediate(0)); - const Node value_compare = BitfieldExtract(mask, offset + shift, 1); - const Node value = - Operation(OperationCode::LogicalUNotEqual, value_compare, Immediate(0)); + Node compare = BitfieldExtract(mask, offset + shift, 1); + Node value = Operation(OperationCode::LogicalUNotEqual, move(compare), Immediate(0)); - const Node code = Operation(OperationCode::LogicalAssign, GetPredicate(pred), value); - bb.push_back(Conditional(condition, {code})); + Node code = Operation(OperationCode::LogicalAssign, get_entry(entry), move(value)); + bb.push_back(Conditional(condition, {move(code)})); } break; } case OpCode::Id::P2R_IMM: { Node value = Immediate(0); - for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { - Node bit = Operation(OperationCode::Select, GetPredicate(pred), Immediate(1U << pred), + for (u64 entry = 0; entry < num_entries; ++entry) { + Node bit = Operation(OperationCode::Select, get_entry(entry), Immediate(1U << entry), Immediate(0)); - value = Operation(OperationCode::UBitwiseOr, std::move(value), std::move(bit)); + value = Operation(OperationCode::UBitwiseOr, move(value), move(bit)); } - value = Operation(OperationCode::UBitwiseAnd, std::move(value), apply_mask); - value = BitfieldInsert(GetRegister(instr.gpr8), std::move(value), offset, 8); + value = Operation(OperationCode::UBitwiseAnd, move(value), apply_mask); + value = BitfieldInsert(GetRegister(instr.gpr8), move(value), offset, 8); - SetRegister(bb, instr.gpr0, std::move(value)); + SetRegister(bb, instr.gpr0, move(value)); break; } default: diff --git 
a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index 3b391d3e6..d4ffa8014 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -23,7 +23,6 @@ Node IsFull(Node shift) { } Node Shift(OperationCode opcode, Node value, Node shift) { - Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32)); Node shifted = Operation(opcode, move(value), shift); return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted)); } diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 6c4a1358b..02fdccd86 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -139,15 +139,15 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { } const Node component = Immediate(static_cast<u32>(instr.tld4s.component)); - const SamplerInfo info{TextureType::Texture2D, false, is_depth_compare}; - const Sampler& sampler = *GetSampler(instr.sampler, info); + SamplerInfo info; + info.is_shadow = is_depth_compare; + const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); Node4 values; for (u32 element = 0; element < values.size(); ++element) { - auto coords_copy = coords; - MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, - {}, {}, component, element, {}}; - values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); + MetaTexture meta{*sampler, {}, depth_compare, aoffi, {}, {}, + {}, {}, component, element, {}}; + values[element] = Operation(OperationCode::TextureGather, meta, coords); } if (instr.tld4s.fp16_flag) { @@ -165,19 +165,20 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { "AOFFI is not implemented"); const bool is_array = instr.txd.is_array != 0; - u64 base_reg = instr.gpr8.Value(); const auto derivate_reg = instr.gpr20.Value(); const auto texture_type = instr.txd.texture_type.Value(); const auto coord_count = GetCoordCount(texture_type); - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(base_reg, index_var, {{texture_type, is_array, false}}) - : GetSampler(instr.sampler, {{texture_type, is_array, false}}); + u64 base_reg = instr.gpr8.Value(); + Node index_var; + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + const std::optional<Sampler> sampler = is_bindless + ? GetBindlessSampler(base_reg, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; - if (sampler == nullptr) { - for (u32 element = 0; element < values.size(); ++element) { - values[element] = Immediate(0); - } + if (!sampler) { + std::generate(values.begin(), values.end(), [this] { return Immediate(0); }); WriteTexInstructionFloat(bb, instr, values); break; } @@ -215,14 +216,12 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { is_bindless = true; [[fallthrough]]; case OpCode::Id::TXQ: { - // TODO: The new commits on the texture refactor, change the way samplers work. - // Sadly, not all texture instructions specify the type of texture their sampler - // uses. This must be fixed at a later instance. - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr8, index_var) : GetSampler(instr.sampler); - - if (sampler == nullptr) { + Node index_var; + const std::optional<Sampler> sampler = is_bindless + ? 
GetBindlessSampler(instr.gpr8, {}, index_var) + : GetSampler(instr.sampler, {}); + + if (!sampler) { u32 indexer = 0; for (u32 element = 0; element < 4; ++element) { if (!instr.txq.IsComponentEnabled(element)) { @@ -268,13 +267,17 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), "NDV is not implemented"); - auto texture_type = instr.tmml.texture_type.Value(); + const auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr20, index_var) : GetSampler(instr.sampler); - - if (sampler == nullptr) { + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + Node index_var; + const std::optional<Sampler> sampler = + is_bindless ? GetBindlessSampler(instr.gpr20, info, index_var) + : GetSampler(instr.sampler, info); + + if (!sampler) { u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { @@ -289,34 +292,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } - std::vector<Node> coords; - - // TODO: Add coordinates for different samplers once other texture types are implemented. - switch (texture_type) { - case TextureType::Texture1D: - coords.push_back(GetRegister(instr.gpr8)); - break; - case TextureType::Texture2D: - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - break; - default: - UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<u32>(texture_type)); + const u64 base_index = is_array ? 1 : 0; + const u64 num_components = [texture_type] { + switch (texture_type) { + case TextureType::Texture1D: + return 1; + case TextureType::Texture2D: + return 2; + case TextureType::TextureCube: + return 3; + default: + UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + return 2; + } + }(); + // TODO: What's the array component used for? - // Fallback to interpreting as a 2D texture for now - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - texture_type = TextureType::Texture2D; + std::vector<Node> coords; + coords.reserve(num_components); + for (u64 component = 0; component < num_components; ++component) { + coords.push_back(GetRegister(instr.gpr8.Value() + base_index + component)); } + u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { continue; } - auto params = coords; MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; - const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); - SetTemporary(bb, indexer++, value); + Node value = Operation(OperationCode::TextureQueryLod, meta, coords); + SetTemporary(bb, indexer++, std::move(value)); } for (u32 i = 0; i < indexer; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); @@ -355,98 +360,122 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sampler_info, u32 offset, - std::optional<u32> buffer) { - if (sampler_info) { - return *sampler_info; +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) { + if (info.IsComplete()) { + return info; } - const auto sampler = buffer ? 
registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); - return SamplerInfo{TextureType::Texture2D, false, false, false}; - } - return SamplerInfo{sampler->texture_type, sampler->is_array != 0, sampler->is_shadow != 0, - sampler->is_buffer != 0}; + info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); + info.is_array = info.is_array.value_or(false); + info.is_shadow = info.is_shadow.value_or(false); + info.is_buffer = info.is_buffer.value_or(false); + return info; + } + info.type = info.type.value_or(sampler->texture_type); + info.is_array = info.is_array.value_or(sampler->is_array != 0); + info.is_shadow = info.is_shadow.value_or(sampler->is_shadow != 0); + info.is_buffer = info.is_buffer.value_or(sampler->is_buffer != 0); + return info; } -const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, - std::optional<SamplerInfo> sampler_info) { - const auto offset = static_cast<u32>(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); +std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, + SamplerInfo sampler_info) { + const u32 offset = static_cast<u32>(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [offset](const Sampler& entry) { return entry.GetOffset() == offset; }); + const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), + [offset](const Sampler& entry) { return entry.offset == offset; }); if (it != used_samplers.end()) { - ASSERT(!it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && - it->IsShadow() == info.is_shadow && it->IsBuffer() == info.is_buffer); - return &*it; + ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow, - info.is_buffer, false); + return used_samplers.emplace_back(next_index, offset, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, false); } -const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, - std::optional<SamplerInfo> sampler_info) { +std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var) { const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { - return nullptr; + if (!base_node) { + UNREACHABLE(); + return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - const auto info = GetSamplerInfo(sampler_info, offset, buffer); + if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = 
GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); + const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), + [buffer, offset](const Sampler& entry) { + return entry.buffer == buffer && entry.offset == offset; + }); if (it != used_samplers.end()) { - ASSERT(it->IsBindless() && it->GetType() == info.type && - it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow); - return &*it; + ASSERT(it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow); + return *it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, - info.is_shadow, info.is_buffer, false); - } else if (const auto array_sampler_info = - std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - const auto info = GetSamplerInfo(sampler_info, base_offset); + return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, false); + } + if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast<u32>(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. 
const auto it = std::find_if( used_samplers.begin(), used_samplers.end(), - [base_offset](const Sampler& entry) { return entry.GetOffset() == base_offset; }); + [base_offset](const Sampler& entry) { return entry.offset == base_offset; }); if (it != used_samplers.end()) { - ASSERT(!it->IsBindless() && it->GetType() == info.type && - it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow && - it->IsBuffer() == info.is_buffer && it->IsIndexed()); - return &*it; + ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer && + it->is_indexed); + return *it; } uses_indexed_samplers = true; // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, base_offset, info.type, info.is_array, - info.is_shadow, info.is_buffer, true); + return used_samplers.emplace_back(next_index, base_offset, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, true); } - return nullptr; + return std::nullopt; } void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { @@ -527,14 +556,19 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, "Illegal texture type"); - const SamplerInfo info{texture_type, is_array, is_shadow, false}; + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = is_shadow; + info.is_buffer = false; + Node index_var; - const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info) - : GetSampler(instr.sampler, info); + const std::optional<Sampler> sampler = is_bindless + ? 
GetBindlessSampler(*bindless_reg, info, index_var) + : GetSampler(instr.sampler, info); if (!sampler) { return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } @@ -593,8 +627,9 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, ++parameter_register; } - const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement( - texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5); + const auto coord_counts = ValidateAndGetCoordinateElement(texture_type, depth_compare, is_array, + lod_bias_enabled, 4, 5); + const auto coord_count = std::get<0>(coord_counts); // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); // First coordinate index is the gpr8 or gpr8 + 1 when arrays are used @@ -632,8 +667,10 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, const bool lod_bias_enabled = (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ); - const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement( - texture_type, depth_compare, is_array, lod_bias_enabled, 4, 4); + const auto coord_counts = ValidateAndGetCoordinateElement(texture_type, depth_compare, is_array, + lod_bias_enabled, 4, 4); + const auto coord_count = std::get<0>(coord_counts); + // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); // First coordinate index is stored in gpr8 field or (gpr8 + 1) when arrays are used @@ -682,12 +719,17 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de u64 parameter_register = instr.gpr20.Value(); - const SamplerInfo info{texture_type, is_array, depth_compare, false}; - Node index_var{}; - const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, index_var, info) - : GetSampler(instr.sampler, info); + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = depth_compare; + + Node index_var; + const std::optional<Sampler> sampler = + is_bindless ? GetBindlessSampler(parameter_register++, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; - if (sampler == nullptr) { + if (!sampler) { for (u32 element = 0; element < values.size(); ++element) { values[element] = Immediate(0); } @@ -723,7 +765,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { const auto texture_type{instr.tld.texture_type}; - const bool is_array{instr.tld.is_array}; + const bool is_array{instr.tld.is_array != 0}; const bool lod_enabled{instr.tld.GetTextureProcessMode() == TextureProcessMode::LL}; const std::size_t coord_count{GetCoordCount(texture_type)}; @@ -742,12 +784,12 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr}; // const Node multisample{is_multisample ? 
GetRegister(gpr20_cursor++) : nullptr}; - const auto& sampler = *GetSampler(instr.sampler); + const std::optional<Sampler> sampler = GetSampler(instr.sampler, {}); Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; + MetaTexture meta{*sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } @@ -755,7 +797,11 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { } Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is_array) { - const Sampler& sampler = *GetSampler(instr.sampler); + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = false; + const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); const std::size_t type_coord_count = GetCoordCount(texture_type); const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL; @@ -783,7 +829,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; + MetaTexture meta{*sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; diff --git a/src/video_core/shader/decode/video.cpp b/src/video_core/shader/decode/video.cpp index 64ba60ea2..1c0957277 100644 --- a/src/video_core/shader/decode/video.cpp +++ b/src/video_core/shader/decode/video.cpp @@ -91,29 +91,28 @@ u32 ShaderIR::DecodeVideo(NodeBlock& bb, u32 pc) { return pc; } -Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed, - Tegra::Shader::VideoType type, u64 byte_height) { +Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed, VideoType type, + u64 byte_height) { if (!is_chunk) { return BitfieldExtract(op, static_cast<u32>(byte_height * 8), 8); } - const Node zero = Immediate(0); switch (type) { - case Tegra::Shader::VideoType::Size16_Low: + case VideoType::Size16_Low: return BitfieldExtract(op, 0, 16); - case Tegra::Shader::VideoType::Size16_High: + case VideoType::Size16_High: return BitfieldExtract(op, 16, 16); - case Tegra::Shader::VideoType::Size32: + case VideoType::Size32: // TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when this type is used // (1 * 1 + 0 == 0x5b800000). Until a better explanation is found: abort. 
UNIMPLEMENTED(); - return zero; - case Tegra::Shader::VideoType::Invalid: + return Immediate(0); + case VideoType::Invalid: UNREACHABLE_MSG("Invalid instruction encoding"); - return zero; + return Immediate(0); default: UNREACHABLE(); - return zero; + return Immediate(0); } } diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index 6191ffba1..233b8fa42 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -81,35 +81,36 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { SetTemporary(bb, 0, product); product = GetTemporary(0); - const Node original_c = op_c; + Node original_c = op_c; const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error - op_c = [&]() { + op_c = [&] { switch (set_mode) { case Tegra::Shader::XmadMode::None: return original_c; case Tegra::Shader::XmadMode::CLo: - return BitfieldExtract(original_c, 0, 16); + return BitfieldExtract(std::move(original_c), 0, 16); case Tegra::Shader::XmadMode::CHi: - return BitfieldExtract(original_c, 16, 16); + return BitfieldExtract(std::move(original_c), 16, 16); case Tegra::Shader::XmadMode::CBcc: { - const Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, - original_b, Immediate(16)); - return SignedOperation(OperationCode::IAdd, is_signed_c, original_c, shifted_b); + Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, + original_b, Immediate(16)); + return SignedOperation(OperationCode::IAdd, is_signed_c, std::move(original_c), + std::move(shifted_b)); } case Tegra::Shader::XmadMode::CSfu: { - const Node comp_a = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_a, - op_a, Immediate(0)); - const Node comp_b = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_b, - op_b, Immediate(0)); + const Node comp_a = + GetPredicateComparisonInteger(PredCondition::EQ, is_signed_a, op_a, Immediate(0)); + const Node comp_b = + GetPredicateComparisonInteger(PredCondition::EQ, is_signed_b, op_b, Immediate(0)); const Node comp = Operation(OperationCode::LogicalOr, comp_a, comp_b); const Node comp_minus_a = GetPredicateComparisonInteger( - PredCondition::NotEqual, is_signed_a, + PredCondition::NE, is_signed_a, SignedOperation(OperationCode::IBitwiseAnd, is_signed_a, op_a, Immediate(0x80000000)), Immediate(0)); const Node comp_minus_b = GetPredicateComparisonInteger( - PredCondition::NotEqual, is_signed_b, + PredCondition::NE, is_signed_b, SignedOperation(OperationCode::IBitwiseAnd, is_signed_b, op_b, Immediate(0x80000000)), Immediate(0)); diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp new file mode 100644 index 000000000..e18ccba8e --- /dev/null +++ b/src/video_core/shader/memory_util.cpp @@ -0,0 +1,76 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> + +#include <boost/container_hash/hash.hpp> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/shader/memory_util.h" +#include "video_core/shader/shader_ir.h" + +namespace VideoCommon::Shader { + +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::Maxwell3D::Regs::ShaderProgram program) { + const auto& shader_config{maxwell3d.regs.shader_config[static_cast<std::size_t>(program)]}; + return maxwell3d.regs.code_address.CodeAddress() + shader_config.offset; +} + +bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { + // Sched instructions appear once every 4 instructions. + constexpr std::size_t SchedPeriod = 4; + const std::size_t absolute_offset = offset - main_offset; + return (absolute_offset % SchedPeriod) == 0; +} + +std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute) { + // This is the encoded version of BRA that jumps to itself. All Nvidia + // shaders end with one. + static constexpr u64 SELF_JUMPING_BRANCH = 0xE2400FFFFF07000FULL; + static constexpr u64 MASK = 0xFFFFFFFFFF7FFFFFULL; + + const std::size_t start_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; + std::size_t offset = start_offset; + while (offset < program.size()) { + const u64 instruction = program[offset]; + if (!IsSchedInstruction(offset, start_offset)) { + if ((instruction & MASK) == SELF_JUMPING_BRANCH) { + // End on Maxwell's "nop" instruction + break; + } + if (instruction == 0) { + break; + } + } + ++offset; + } + // The last instruction is included in the program size + return std::min(offset + 1, program.size()); +} + +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr, + const u8* host_ptr, bool is_compute) { + ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); + ASSERT_OR_EXECUTE(host_ptr != nullptr, { return code; }); + memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64)); + code.resize(CalculateProgramSize(code, is_compute)); + return code; +} + +u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, + const ProgramCode& code_b) { + size_t unique_identifier = boost::hash_value(code); + if (is_a) { + // VertexA programs include two programs + boost::hash_combine(unique_identifier, boost::hash_value(code_b)); + } + return static_cast<u64>(unique_identifier); +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/memory_util.h b/src/video_core/shader/memory_util.h new file mode 100644 index 000000000..4624d38e6 --- /dev/null +++ b/src/video_core/shader/memory_util.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <cstddef> +#include <vector> + +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" + +namespace Tegra { +class MemoryManager; +} + +namespace VideoCommon::Shader { + +using ProgramCode = std::vector<u64>; + +constexpr u32 STAGE_MAIN_OFFSET = 10; +constexpr u32 KERNEL_MAIN_OFFSET = 0; + +/// Gets the address for the specified shader stage program +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::Maxwell3D::Regs::ShaderProgram program); + +/// Gets if the current instruction offset is a scheduler instruction +bool IsSchedInstruction(std::size_t offset, std::size_t main_offset); + +/// Calculates the size of a program stream +std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute); + +/// Gets the shader program code from memory for the specified address +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr, + const u8* host_ptr, bool is_compute); + +/// Hashes one (or two) program streams +u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, + const ProgramCode& code_b = {}); + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 3eee961f5..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -110,13 +110,20 @@ enum class OperationCode { LogicalPick2, /// (bool2 pair, uint index) -> bool LogicalAnd2, /// (bool2 a) -> bool - LogicalFLessThan, /// (float a, float b) -> bool - LogicalFEqual, /// (float a, float b) -> bool - LogicalFLessEqual, /// (float a, float b) -> bool - LogicalFGreaterThan, /// (float a, float b) -> bool - LogicalFNotEqual, /// (float a, float b) -> bool - LogicalFGreaterEqual, /// (float a, float b) -> bool - LogicalFIsNan, /// (float a) -> bool + LogicalFOrdLessThan, /// (float a, float b) -> bool + LogicalFOrdEqual, /// (float a, float b) -> bool + LogicalFOrdLessEqual, /// (float a, float b) -> bool + LogicalFOrdGreaterThan, /// (float a, float b) -> bool + LogicalFOrdNotEqual, /// (float a, float b) -> bool + LogicalFOrdGreaterEqual, /// (float a, float b) -> bool + LogicalFOrdered, /// (float a, float b) -> bool + LogicalFUnordered, /// (float a, float b) -> bool + LogicalFUnordLessThan, /// (float a, float b) -> bool + LogicalFUnordEqual, /// (float a, float b) -> bool + LogicalFUnordLessEqual, /// (float a, float b) -> bool + LogicalFUnordGreaterThan, /// (float a, float b) -> bool + LogicalFUnordNotEqual, /// (float a, float b) -> bool + LogicalFUnordGreaterEqual, /// (float a, float b) -> bool LogicalILessThan, /// (int a, int b) -> bool LogicalIEqual, /// (int a, int b) -> bool @@ -132,6 +139,8 @@ enum class OperationCode { LogicalUNotEqual, /// (uint a, uint b) -> bool LogicalUGreaterEqual, /// (uint a, uint b) -> bool + LogicalAddCarry, /// (uint a, uint b) -> bool + Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 @@ -217,9 +226,16 @@ enum class OperationCode { VoteEqual, /// (bool) -> bool ThreadId, /// () -> uint + ThreadEqMask, /// () -> uint + ThreadGeMask, /// () -> uint + ThreadGtMask, /// () -> uint + ThreadLeMask, /// () -> uint + ThreadLtMask, /// () -> uint ShuffleIndexed, /// (uint value, uint index) -> uint - MemoryBarrierGL, /// () -> void + Barrier, /// () 
-> void + MemoryBarrierGroup, /// () -> void + MemoryBarrierGlobal, /// () -> void Amount, }; @@ -259,133 +275,76 @@ using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; -class Sampler { -public: - /// This constructor is for bound samplers +struct Sampler { + /// Bound samplers constructor constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} - /// This constructor is for bindless samplers + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } - - constexpr u32 GetBuffer() const { - return buffer; - } - - constexpr Tegra::Shader::TextureType GetType() const { - return type; - } - - constexpr bool IsArray() const { - return is_array; - } - - constexpr bool IsShadow() const { - return is_shadow; - } - - constexpr bool IsBuffer() const { - return is_buffer; - } - - constexpr bool IsBindless() const { - return is_bindless; - } - - constexpr bool IsIndexed() const { - return is_indexed; - } - - constexpr u32 Size() const { - return size; - } - - constexpr void SetSize(u32 new_size) { - size = new_size; - } - -private: - u32 index{}; ///< Emulated index given for the this sampler. - u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size{1}; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. 
- bool is_buffer{}; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed{}; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair<u32, u32> indices; + std::pair<u32, u32> offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode { u32 index; u32 offset; }; -class Image final { +struct Image { public: - /// This constructor is for bound images + /// Bound images constructor constexpr explicit Image(u32 index, u32 offset, Tegra::Shader::ImageType type) : index{index}, offset{offset}, type{type} {} - /// This constructor is for bindless samplers + /// Bindless samplers constructor constexpr explicit Image(u32 index, u32 offset, u32 buffer, Tegra::Shader::ImageType type) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_bindless{true} {} @@ -403,53 +362,20 @@ public: is_atomic = true; } - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } - - constexpr u32 GetBuffer() const { - return buffer; - } - - constexpr Tegra::Shader::ImageType GetType() const { - return type; - } - - constexpr bool IsBindless() const { - return is_bindless; - } - - constexpr bool IsWritten() const { - return is_written; - } - - constexpr bool IsRead() const { - return is_read; - } - - constexpr bool IsAtomic() const { - return is_atomic; - } - -private: - u32 index{}; - u32 offset{}; - u32 buffer{}; + u32 index = 0; + u32 offset = 0; + u32 buffer = 0; Tegra::Shader::ImageType type{}; - bool is_bindless{}; - bool is_written{}; - bool is_read{}; - bool is_atomic{}; + bool is_bindless = false; + bool is_written = false; + bool is_read = false; + bool is_atomic = false; }; struct GlobalMemoryBase { - u32 cbuf_index{}; - u32 cbuf_offset{}; + u32 cbuf_index = 0; + u32 cbuf_offset = 0; bool operator<(const GlobalMemoryBase& rhs) const { return std::tie(cbuf_index, cbuf_offset) < 
std::tie(rhs.cbuf_index, rhs.cbuf_offset); @@ -463,7 +389,7 @@ struct MetaArithmetic { /// Parameters describing a texture sampler struct MetaTexture { - const Sampler& sampler; + Sampler sampler; Node array; Node depth_compare; std::vector<Node> aoffi; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template <typename T, typename... Args> TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v<T, TrackSamplerData>); - return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); + return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...}); } template <typename... Args> diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..148d91fcb 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -24,44 +24,45 @@ GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterfac if (shader_stage == ShaderType::Compute) { return {}; } - auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine); - - GraphicsInfo info; - info.tfb_layouts = graphics.regs.tfb_layouts; - info.tfb_varying_locs = graphics.regs.tfb_varying_locs; - info.primitive_topology = graphics.regs.draw.topology; - info.tessellation_primitive = graphics.regs.tess_mode.prim; - info.tessellation_spacing = graphics.regs.tess_mode.spacing; - info.tfb_enabled = graphics.regs.tfb_enabled; - info.tessellation_clockwise = graphics.regs.tess_mode.cw; - return info; + + auto& graphics = dynamic_cast<Tegra::Engines::Maxwell3D&>(engine); + + return { + .tfb_layouts = graphics.regs.tfb_layouts, + .tfb_varying_locs = graphics.regs.tfb_varying_locs, + .primitive_topology = graphics.regs.draw.topology, + .tessellation_primitive = graphics.regs.tess_mode.prim, + .tessellation_spacing = graphics.regs.tess_mode.spacing, + .tfb_enabled = graphics.regs.tfb_enabled != 0, + .tessellation_clockwise = graphics.regs.tess_mode.cw.Value() != 0, + }; } ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { if (shader_stage != ShaderType::Compute) { return {}; } - auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + + auto& compute = dynamic_cast<Tegra::Engines::KeplerCompute&>(engine); const auto& launch = compute.launch_description; - ComputeInfo info; - info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; - info.local_memory_size_in_words = launch.local_pos_alloc; - info.shared_memory_size_in_words = launch.shared_alloc; - return info; + return { + .workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}, + .shared_memory_size_in_words = launch.shared_alloc, + .local_memory_size_in_words = launch.local_pos_alloc, + }; } } // Anonymous namespace -Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) +Registry::Registry(ShaderType shader_stage, const SerializedRegistryInfo& info) : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} -Registry::Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, - 
graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( - shader_stage, engine)} {} +Registry::Registry(ShaderType shader_stage, ConstBufferEngineInterface& engine_) + : stage{shader_stage}, engine{&engine_}, bound_buffer{engine_.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine_)}, compute_info{MakeComputeInfo( + shader_stage, engine_)} {} Registry::~Registry() = default; @@ -93,8 +94,27 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { return value; } -std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, - u32 offset) { +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + +std::optional<SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; const auto iter = bindless_samplers.find(key); if (iter != bindless_samplers.end()) { diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..4bebefdde 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair<u32, u32> buffers; + std::pair<u32, u32> offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash<VideoCommon::Shader::SeparateSamplerKey> { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to<VideoCommon::Shader::SeparateSamplerKey> { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using SeparateSamplerMap = + std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>; using BindlessSamplerMap = std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -63,7 +94,7 @@ public: explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); explicit Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); + Tegra::Engines::ConstBufferEngineInterface& engine_); ~Registry(); @@ -73,6 +104,9 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets); + 
std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 8852c8a1b..29d794b34 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" +#include "video_core/shader/node.h" #include "video_core/shader/node_helper.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -56,8 +57,7 @@ Node ShaderIR::GetConstBuffer(u64 index_, u64 offset_) { const auto index = static_cast<u32>(index_); const auto offset = static_cast<u32>(offset_); - const auto [entry, is_new] = used_cbufs.try_emplace(index); - entry->second.MarkAsUsed(offset); + used_cbufs.try_emplace(index).first->second.MarkAsUsed(offset); return MakeNode<CbufNode>(index, Immediate(offset)); } @@ -66,8 +66,7 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) { const auto index = static_cast<u32>(index_); const auto offset = static_cast<u32>(offset_); - const auto [entry, is_new] = used_cbufs.try_emplace(index); - entry->second.MarkAsUsedIndirect(); + used_cbufs.try_emplace(index).first->second.MarkAsUsedIndirect(); Node final_offset = [&] { // Attempt to inline constant buffer without a variable offset. This is done to allow @@ -113,9 +112,9 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff } Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) const { - const Node node = MakeNode<InternalFlagNode>(flag); + Node node = MakeNode<InternalFlagNode>(flag); if (negated) { - return Operation(OperationCode::LogicalNegate, node); + return Operation(OperationCode::LogicalNegate, std::move(node)); } return node; } @@ -166,6 +165,7 @@ Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signe std::move(value), Immediate(16)); value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, std::move(value), Immediate(16)); + return value; case Register::Size::Word: // Default - do nothing return value; @@ -244,56 +244,44 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) { } Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { + if (condition == PredCondition::T) { + return GetPredicate(true); + } else if (condition == PredCondition::F) { + return GetPredicate(false); + } + static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::LogicalFLessThan}, - std::pair{PredCondition::Equal, OperationCode::LogicalFEqual}, - std::pair{PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::LogicalFGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::LogicalFNotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::LogicalFGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::LogicalFLessThan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::LogicalFNotEqual}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::LogicalFLessEqual}, - 
std::pair{PredCondition::GreaterThanWithNan, OperationCode::LogicalFGreaterThan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::LogicalFGreaterEqual}, + OperationCode(0), + OperationCode::LogicalFOrdLessThan, // LT + OperationCode::LogicalFOrdEqual, // EQ + OperationCode::LogicalFOrdLessEqual, // LE + OperationCode::LogicalFOrdGreaterThan, // GT + OperationCode::LogicalFOrdNotEqual, // NE + OperationCode::LogicalFOrdGreaterEqual, // GE + OperationCode::LogicalFOrdered, // NUM + OperationCode::LogicalFUnordered, // NAN + OperationCode::LogicalFUnordLessThan, // LTU + OperationCode::LogicalFUnordEqual, // EQU + OperationCode::LogicalFUnordLessEqual, // LEU + OperationCode::LogicalFUnordGreaterThan, // GTU + OperationCode::LogicalFUnordNotEqual, // NEU + OperationCode::LogicalFUnordGreaterEqual, // GEU }; + const std::size_t index = static_cast<std::size_t>(condition); + ASSERT_MSG(index < std::size(comparison_table), "Invalid condition={}", index); - const auto comparison = - std::find_if(comparison_table.cbegin(), comparison_table.cend(), - [condition](const auto entry) { return condition == entry.first; }); - UNIMPLEMENTED_IF_MSG(comparison == comparison_table.cend(), - "Unknown predicate comparison operation"); - - Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); - - if (condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan) { - predicate = Operation(OperationCode::LogicalOr, predicate, - Operation(OperationCode::LogicalFIsNan, op_a)); - predicate = Operation(OperationCode::LogicalOr, predicate, - Operation(OperationCode::LogicalFIsNan, op_b)); - } - - return predicate; + return Operation(comparison_table[index], op_a, op_b); } Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, Node op_b) { static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::LogicalILessThan}, - std::pair{PredCondition::Equal, OperationCode::LogicalIEqual}, - std::pair{PredCondition::LessEqual, OperationCode::LogicalILessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::LogicalIGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::LogicalINotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::LogicalIGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::LogicalILessThan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::LogicalINotEqual}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::LogicalILessEqual}, - std::pair{PredCondition::GreaterThanWithNan, OperationCode::LogicalIGreaterThan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::LogicalIGreaterEqual}, + std::pair{PredCondition::LT, OperationCode::LogicalILessThan}, + std::pair{PredCondition::EQ, OperationCode::LogicalIEqual}, + std::pair{PredCondition::LE, OperationCode::LogicalILessEqual}, + std::pair{PredCondition::GT, OperationCode::LogicalIGreaterThan}, + std::pair{PredCondition::NE, OperationCode::LogicalINotEqual}, + std::pair{PredCondition::GE, OperationCode::LogicalIGreaterEqual}, }; const auto comparison = @@ -302,32 +290,24 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si UNIMPLEMENTED_IF_MSG(comparison == comparison_table.cend(), "Unknown predicate comparison operation"); - Node predicate = 
SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a), - std::move(op_b)); - - UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan, - "NaN comparisons for integers are not implemented"); - return predicate; + return SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a), + std::move(op_b)); } Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b) { static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::Logical2HLessThan}, - std::pair{PredCondition::Equal, OperationCode::Logical2HEqual}, - std::pair{PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan}, - std::pair{PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}, + std::pair{PredCondition::LT, OperationCode::Logical2HLessThan}, + std::pair{PredCondition::EQ, OperationCode::Logical2HEqual}, + std::pair{PredCondition::LE, OperationCode::Logical2HLessEqual}, + std::pair{PredCondition::GT, OperationCode::Logical2HGreaterThan}, + std::pair{PredCondition::NE, OperationCode::Logical2HNotEqual}, + std::pair{PredCondition::GE, OperationCode::Logical2HGreaterEqual}, + std::pair{PredCondition::LTU, OperationCode::Logical2HLessThanWithNan}, + std::pair{PredCondition::LEU, OperationCode::Logical2HLessEqualWithNan}, + std::pair{PredCondition::GTU, OperationCode::Logical2HGreaterThanWithNan}, + std::pair{PredCondition::NEU, OperationCode::Logical2HNotEqualWithNan}, + std::pair{PredCondition::GEU, OperationCode::Logical2HGreaterEqualWithNan}, }; const auto comparison = @@ -398,7 +378,7 @@ void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc if (!sets_cc) { return; } - Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f)); + Node zerop = Operation(OperationCode::LogicalFOrdEqual, std::move(value), Immediate(0.0f)); SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop)); LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete"); } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index c6e7bdf50..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -18,6 +18,7 @@ #include "video_core/engines/shader_header.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/node.h" #include "video_core/shader/registry.h" @@ -25,16 +26,13 @@ namespace VideoCommon::Shader { struct ShaderBlock; -using ProgramCode = std::vector<u64>; - constexpr u32 MAX_PROGRAM_LENGTH = 0x1000; -class ConstBuffer { -public: - explicit ConstBuffer(u32 max_offset, bool is_indirect) 
+struct ConstBuffer { + constexpr explicit ConstBuffer(u32 max_offset, bool is_indirect) : max_offset{max_offset}, is_indirect{is_indirect} {} - ConstBuffer() = default; + constexpr ConstBuffer() = default; void MarkAsUsed(u64 offset) { max_offset = std::max(max_offset, static_cast<u32>(offset)); @@ -57,8 +55,8 @@ public: } private: - u32 max_offset{}; - bool is_indirect{}; + u32 max_offset = 0; + bool is_indirect = false; }; struct GlobalMemoryUsage { @@ -192,10 +190,14 @@ private: friend class ASTDecoder; struct SamplerInfo { - Tegra::Shader::TextureType type; - bool is_array; - bool is_shadow; - bool is_buffer; + std::optional<Tegra::Shader::TextureType> type; + std::optional<bool> is_array; + std::optional<bool> is_shadow; + std::optional<bool> is_buffer; + + constexpr bool IsComplete() const noexcept { + return type && is_array && is_shadow && is_buffer; + } }; void Decode(); @@ -328,16 +330,15 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(std::optional<SamplerInfo> sampler_info, u32 offset, - std::optional<u32> buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional<Tegra::Engines::SamplerDescriptor> sampler); - /// Accesses a texture sampler - const Sampler* GetSampler(const Tegra::Shader::Sampler& sampler, - std::optional<SamplerInfo> sampler_info = std::nullopt); + /// Accesses a texture sampler. + std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); /// Accesses a texture sampler for a bindless texture. - const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, - std::optional<SamplerInfo> sampler_info = std::nullopt); + std::optional<Sampler> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var); /// Accesses an image. 
Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); @@ -408,8 +409,14 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; - std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor); + std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + + std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf, + const OperationNode& operation, + Node gpr, Node base_offset, + Node tracked, const NodeBlock& code, + s64 cursor); std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 10739b37d..6be3ea92b 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@ namespace VideoCommon::Shader { namespace { + std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { @@ -27,8 +28,9 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { const auto& conditional_code = conditional->GetCode(); - auto [found, internal_cursor] = FindOperation( + auto result = FindOperation( conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code); + auto& found = result.first; if (found) { return {std::move(found), cursor}; } @@ -62,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if<OperationNode>(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + } + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -71,39 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { } // Anonymous namespace -std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor) { +std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { + const u32 cbuf_index = cbuf->GetIndex(); + // Constant buffer found, test if it's an immediate - const auto offset = cbuf->GetOffset(); + const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { - auto track = - MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); + auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue()); return {tracked, track}; - } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { + } + if (const auto operation = std::get_if<OperationNode>(&*offset)) { const u32 bound_buffer = registry.GetBoundBuffer(); - if (bound_buffer != cbuf->GetIndex()) { + if (bound_buffer != cbuf_index) { return {}; } - const auto pair = DecoupleIndirectRead(*operation); - if (!pair) { - return {}; + if (const std::optional pair = DecoupleIndirectRead(*operation)) { + auto [gpr, base_offset] = *pair; + return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, + code, cursor); } - auto [gpr, base_offset] = *pair; - const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - const auto& gpu_driver = registry.AccessGuestDriverProfile(); - const u32 
bindless_cv = NewCustomVariable(); - const Node op = - Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - - const Node cv_node = GetCustomVariable(bindless_cv); - Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); - const std::size_t amend_index = DeclareAmend(amend_op); - AmendNodeCv(amend_index, code[cursor]); - // TODO Implement Bindless Index custom variable - auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), - offset_inm->GetValue(), bindless_cv); - return {tracked, track}; } return {}; } @@ -120,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } @@ -137,11 +141,31 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return {}; } +std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead( + const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, + const NodeBlock& code, s64 cursor) { + const auto offset_imm = std::get<ImmediateNode>(*base_offset); + const auto& gpu_driver = registry.AccessGuestDriverProfile(); + const u32 bindless_cv = NewCustomVariable(); + const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); + Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + + Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); + const std::size_t amend_index = DeclareAmend(std::move(amend_op)); + AmendNodeCv(amend_index, code[cursor]); + + // TODO: Implement bindless index custom variable + auto track = + MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); + return {tracked, track}; +} + std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { // Constant buffer found, test if it's an immediate - const auto offset = cbuf->GetOffset(); + const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { return {tracked, cbuf->GetIndex(), immediate->GetValue()}; } @@ -151,21 +175,13 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { return {}; } - s64 current_cursor = cursor; - while (current_cursor > 0) { - // Reduce the cursor in one to avoid infinite loops 
when the instruction sets the same - // register that it uses as operand - const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1); - current_cursor = new_cursor; - if (!source) { - continue; - } - const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor); - if (base_address != nullptr) { - return {base_address, index, offset}; - } + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same + // register that it uses as operand + const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1); + if (!source) { + return {}; } - return {}; + return TrackCbuf(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { @@ -186,15 +202,15 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const { // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register // that it uses as operand - const auto [found, found_cursor] = - TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); + const auto result = TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); + const auto& found = result.first; if (!found) { - return {}; + return std::nullopt; } if (const auto immediate = std::get_if<ImmediateNode>(&*found)) { return immediate->GetValue(); } - return {}; + return std::nullopt; } std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code, diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 000000000..015a789d6 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,240 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
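As an illustrative aside (not part of the patch), the sketch below shows the page arithmetic the new ShaderCache relies on: an entry covering [addr, addr + size) is registered under every 16 KiB page it overlaps (PAGE_BITS = 14), so an invalidation only has to visit the pages touched by the CPU write. The address and size used here are made-up example values.

// Illustrative sketch of ShaderCache's page indexing; addr/size are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t PAGE_BITS = 14;                      // as in ShaderCache below
    constexpr std::uint64_t PAGE_SIZE = std::uint64_t{1} << PAGE_BITS;

    const std::uint64_t addr = 0x8003f00;  // hypothetical shader start address
    const std::uint64_t size = 0x9000;     // hypothetical shader size (36 KiB)
    const std::uint64_t addr_end = addr + size;

    // Same arithmetic as Register()/InvalidatePagesInRegion(): round the end up
    // to the next page so partially covered pages are included.
    const std::uint64_t page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS;
    for (std::uint64_t page = addr >> PAGE_BITS; page < page_end; ++page) {
        std::printf("registered under page 0x%llx\n",
                    static_cast<unsigned long long>(page));
    }
    return 0;
}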
+ +#pragma once + +#include <algorithm> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class T> +class ShaderCache { + static constexpr u64 PAGE_BITS = 14; + static constexpr u64 PAGE_SIZE = u64(1) << PAGE_BITS; + + struct Entry { + VAddr addr_start; + VAddr addr_end; + T* data; + + bool is_memory_marked = true; + + constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { + return start < addr_end && addr_start < end; + } + }; + +public: + virtual ~ShaderCache() = default; + + /// @brief Removes shaders inside a given region + /// @note Checks for ranges + /// @param addr Start address of the invalidation + /// @param size Number of bytes of the invalidation + void InvalidateRegion(VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); + } + + /// @brief Unmarks a memory region as cached and marks it for removal + /// @param addr Start address of the CPU write operation + /// @param size Number of bytes of the CPU write operation + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + } + + /// @brief Flushes delayed removal operations + void SyncGuestHost() { + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); + } + + /// @brief Tries to obtain a cached shader starting in a given address + /// @note Doesn't check for ranges, the given address has to be the start of the shader + /// @param addr Start address of the shader, this doesn't cache for region + /// @return Pointer to a valid shader, nullptr when nothing is found + T* TryGet(VAddr addr) const { + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; + } + +protected: + explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + + /// @brief Register in the cache a given entry + /// @param data Shader to store in the cache + /// @param addr Start address of the shader that will be registered + /// @param size Size in bytes of the shader + void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + rasterizer.UpdatePagesCachedCount(addr, size, 1); + } + + /// @brief Called when a shader is going to be removed + /// @param shader Shader that will be removed + /// @pre invalidation_cache is locked + /// @pre lookup_mutex is locked + virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: + /// @brief Invalidate pages in a given region + /// @pre invalidation_mutex is locked + void InvalidatePagesInRegion(VAddr addr, std::size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) { + auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + 
InvalidatePageEntries(it->second, addr, addr_end); + } + } + + /// @brief Remove shaders marked for deletion + /// @pre invalidation_mutex is locked + void RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + // Remove duplicates + std::sort(marked_for_removal.begin(), marked_for_removal.end()); + marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), + marked_for_removal.end()); + + std::vector<T*> removed_shaders; + removed_shaders.reserve(marked_for_removal.size()); + + std::scoped_lock lock{lookup_mutex}; + + for (Entry* const entry : marked_for_removal) { + removed_shaders.push_back(entry->data); + + const auto it = lookup_cache.find(entry->addr_start); + ASSERT(it != lookup_cache.end()); + lookup_cache.erase(it); + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(std::move(removed_shaders)); + } + } + + /// @brief Invalidates entries in a given range for the passed page + /// @param entries Vector of entries in the page, it will be modified on overlaps + /// @param addr Start address of the invalidation + /// @param addr_end Non-inclusive end address of the invalidation + /// @pre invalidation_mutex is locked + void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) { + std::size_t index = 0; + while (index < entries.size()) { + Entry* const entry = entries[index]; + if (!entry->Overlaps(addr, addr_end)) { + ++index; + continue; + } + + UnmarkMemory(entry); + RemoveEntryFromInvalidationCache(entry); + marked_for_removal.push_back(entry); + } + } + + /// @brief Removes all references to an entry in the invalidation cache + /// @param entry Entry to remove from the invalidation cache + /// @pre invalidation_mutex is locked + void RemoveEntryFromInvalidationCache(const Entry* entry) { + const u64 page_end = (entry->addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = entry->addr_start >> PAGE_BITS; page < page_end; ++page) { + const auto entries_it = invalidation_cache.find(page); + ASSERT(entries_it != invalidation_cache.end()); + std::vector<Entry*>& entries = entries_it->second; + + const auto entry_it = std::find(entries.begin(), entries.end(), entry); + ASSERT(entry_it != entries.end()); + entries.erase(entry_it); + } + } + + /// @brief Unmarks an entry from the rasterizer cache + /// @param entry Entry to unmark from memory + void UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const std::size_t size = entry->addr_end - addr; + rasterizer.UpdatePagesCachedCount(addr, size, -1); + } + + /// @brief Removes a vector of shaders from a list + /// @param removed_shaders Shaders to be removed from the storage + /// @pre invalidation_mutex is locked + /// @pre lookup_mutex is locked + void RemoveShadersFromStorage(std::vector<T*> removed_shaders) { + // Notify removals + for (T* const shader : removed_shaders) { + OnShaderRemoval(shader); + } + + // Remove them from the cache + const auto is_removed = [&removed_shaders](const std::unique_ptr<T>& shader) { + return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != + removed_shaders.end(); + }; + std::erase_if(storage, is_removed); + } + + /// @brief Creates a new entry in the lookup cache and returns its pointer + /// @pre lookup_mutex is locked + Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { + auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data}); + 
Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; + } + + VideoCore::RasterizerInterface& rasterizer; + + mutable std::mutex lookup_mutex; + std::mutex invalidation_mutex; + + std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache; + std::unordered_map<u64, std::vector<Entry*>> invalidation_cache; + std::vector<std::unique_ptr<T>> storage; + std::vector<Entry*> marked_for_removal; +}; + +} // namespace VideoCommon diff --git a/src/video_core/shader_notify.cpp b/src/video_core/shader_notify.cpp new file mode 100644 index 000000000..c3c71657d --- /dev/null +++ b/src/video_core/shader_notify.cpp @@ -0,0 +1,42 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "video_core/shader_notify.h" + +using namespace std::chrono_literals; + +namespace VideoCore { +namespace { +constexpr auto UPDATE_TICK = 32ms; +} + +ShaderNotify::ShaderNotify() = default; +ShaderNotify::~ShaderNotify() = default; + +std::size_t ShaderNotify::GetShadersBuilding() { + const auto now = std::chrono::high_resolution_clock::now(); + const auto diff = now - last_update; + if (diff > UPDATE_TICK) { + std::shared_lock lock(mutex); + last_updated_count = accurate_count; + } + return last_updated_count; +} + +std::size_t ShaderNotify::GetShadersBuildingAccurate() { + std::shared_lock lock{mutex}; + return accurate_count; +} + +void ShaderNotify::MarkShaderComplete() { + std::unique_lock lock{mutex}; + accurate_count--; +} + +void ShaderNotify::MarkSharderBuilding() { + std::unique_lock lock{mutex}; + accurate_count++; +} + +} // namespace VideoCore diff --git a/src/video_core/shader_notify.h b/src/video_core/shader_notify.h new file mode 100644 index 000000000..a9c92d179 --- /dev/null +++ b/src/video_core/shader_notify.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
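As an illustrative aside (not part of the patch), a hedged usage sketch for the counter declared below; the worker and polling code here is assumed for illustration only, the real call sites are elsewhere in this change set.

// Hedged usage sketch for ShaderNotify; the surrounding code is illustrative.
#include <cstdio>
#include "video_core/shader_notify.h"

void BuildOneShader(VideoCore::ShaderNotify& notify) {
    notify.MarkSharderBuilding();  // spelled as declared in this header
    // ... compile the shader, typically on a worker thread ...
    notify.MarkShaderComplete();
}

int main() {
    VideoCore::ShaderNotify notify;
    BuildOneShader(notify);
    // A frontend can poll the cheaper, throttled count here,
    // or GetShadersBuildingAccurate() for the exact value.
    std::printf("shaders building: %zu\n", notify.GetShadersBuilding());
    return 0;
}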
+ +#pragma once + +#include <chrono> +#include <shared_mutex> +#include "common/common_types.h" + +namespace VideoCore { +class ShaderNotify { +public: + ShaderNotify(); + ~ShaderNotify(); + + std::size_t GetShadersBuilding(); + std::size_t GetShadersBuildingAccurate(); + + void MarkShaderComplete(); + void MarkSharderBuilding(); + +private: + std::size_t last_updated_count{}; + std::size_t accurate_count{}; + std::shared_mutex mutex; + std::chrono::high_resolution_clock::time_point last_update{}; +}; +} // namespace VideoCore diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cc7181229..1688267bb 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -74,115 +74,131 @@ bool SurfaceTargetIsArray(SurfaceTarget target) { PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { switch (format) { - case Tegra::DepthFormat::S8_Z24_UNORM: - return PixelFormat::S8Z24; - case Tegra::DepthFormat::Z24_S8_UNORM: - return PixelFormat::Z24S8; - case Tegra::DepthFormat::Z32_FLOAT: - return PixelFormat::Z32F; - case Tegra::DepthFormat::Z16_UNORM: - return PixelFormat::Z16; - case Tegra::DepthFormat::Z32_S8_X24_FLOAT: - return PixelFormat::Z32FS8; + case Tegra::DepthFormat::S8_UINT_Z24_UNORM: + return PixelFormat::S8_UINT_D24_UNORM; + case Tegra::DepthFormat::D24S8_UNORM: + return PixelFormat::D24_UNORM_S8_UINT; + case Tegra::DepthFormat::D32_FLOAT: + return PixelFormat::D32_FLOAT; + case Tegra::DepthFormat::D16_UNORM: + return PixelFormat::D16_UNORM; + case Tegra::DepthFormat::D32_FLOAT_S8X24_UINT: + return PixelFormat::D32_FLOAT_S8_UINT; default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - return PixelFormat::S8Z24; + UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + return PixelFormat::S8_UINT_D24_UNORM; } } PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) { switch (format) { - case Tegra::RenderTargetFormat::RGBA8_SRGB: - return PixelFormat::RGBA8_SRGB; - case Tegra::RenderTargetFormat::RGBA8_UNORM: - return PixelFormat::ABGR8U; - case Tegra::RenderTargetFormat::RGBA8_SNORM: - return PixelFormat::ABGR8S; - case Tegra::RenderTargetFormat::RGBA8_UINT: - return PixelFormat::ABGR8UI; - case Tegra::RenderTargetFormat::BGRA8_SRGB: - return PixelFormat::BGRA8_SRGB; - case Tegra::RenderTargetFormat::BGRA8_UNORM: - return PixelFormat::BGRA8; - case Tegra::RenderTargetFormat::RGB10_A2_UNORM: - return PixelFormat::A2B10G10R10U; - case Tegra::RenderTargetFormat::RGBA16_FLOAT: - return PixelFormat::RGBA16F; - case Tegra::RenderTargetFormat::RGBA16_UNORM: - return PixelFormat::RGBA16U; - case Tegra::RenderTargetFormat::RGBA16_SNORM: - return PixelFormat::RGBA16S; - case Tegra::RenderTargetFormat::RGBA16_UINT: - return PixelFormat::RGBA16UI; - case Tegra::RenderTargetFormat::RGBA32_FLOAT: - return PixelFormat::RGBA32F; - case Tegra::RenderTargetFormat::RG32_FLOAT: - return PixelFormat::RG32F; - case Tegra::RenderTargetFormat::R11G11B10_FLOAT: - return PixelFormat::R11FG11FB10F; - case Tegra::RenderTargetFormat::B5G6R5_UNORM: - return PixelFormat::B5G6R5U; - case Tegra::RenderTargetFormat::BGR5A1_UNORM: - return PixelFormat::A1B5G5R5U; - case Tegra::RenderTargetFormat::RGBA32_UINT: - return PixelFormat::RGBA32UI; - case Tegra::RenderTargetFormat::R8_UNORM: - return PixelFormat::R8U; - case Tegra::RenderTargetFormat::R8_UINT: - return PixelFormat::R8UI; - case Tegra::RenderTargetFormat::RG16_FLOAT: - return PixelFormat::RG16F; - case 
Tegra::RenderTargetFormat::RG16_UINT:
- return PixelFormat::RG16UI;
- case Tegra::RenderTargetFormat::RG16_SINT:
- return PixelFormat::RG16I;
- case Tegra::RenderTargetFormat::RG16_UNORM:
- return PixelFormat::RG16;
- case Tegra::RenderTargetFormat::RG16_SNORM:
- return PixelFormat::RG16S;
- case Tegra::RenderTargetFormat::RG8_UNORM:
- return PixelFormat::RG8U;
- case Tegra::RenderTargetFormat::RG8_SNORM:
- return PixelFormat::RG8S;
- case Tegra::RenderTargetFormat::R16_FLOAT:
- return PixelFormat::R16F;
+ case Tegra::RenderTargetFormat::R32B32G32A32_FLOAT:
+ return PixelFormat::R32G32B32A32_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32B32A32_SINT:
+ return PixelFormat::R32G32B32A32_SINT;
+ case Tegra::RenderTargetFormat::R32G32B32A32_UINT:
+ return PixelFormat::R32G32B32A32_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_UNORM:
+ return PixelFormat::R16G16B16A16_UNORM;
+ case Tegra::RenderTargetFormat::R16G16B16A16_SNORM:
+ return PixelFormat::R16G16B16A16_SNORM;
+ case Tegra::RenderTargetFormat::R16G16B16A16_SINT:
+ return PixelFormat::R16G16B16A16_SINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_UINT:
+ return PixelFormat::R16G16B16A16_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_FLOAT:
+ return PixelFormat::R16G16B16A16_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32_FLOAT:
+ return PixelFormat::R32G32_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32_SINT:
+ return PixelFormat::R32G32_SINT;
+ case Tegra::RenderTargetFormat::R32G32_UINT:
+ return PixelFormat::R32G32_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16X16_FLOAT:
+ return PixelFormat::R16G16B16X16_FLOAT;
+ case Tegra::RenderTargetFormat::B8G8R8A8_UNORM:
+ return PixelFormat::B8G8R8A8_UNORM;
+ case Tegra::RenderTargetFormat::B8G8R8A8_SRGB:
+ return PixelFormat::B8G8R8A8_SRGB;
+ case Tegra::RenderTargetFormat::A2B10G10R10_UNORM:
+ return PixelFormat::A2B10G10R10_UNORM;
+ case Tegra::RenderTargetFormat::A2B10G10R10_UINT:
+ return PixelFormat::A2B10G10R10_UINT;
+ case Tegra::RenderTargetFormat::A8B8G8R8_UNORM:
+ return PixelFormat::A8B8G8R8_UNORM;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SRGB:
+ return PixelFormat::A8B8G8R8_SRGB;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SNORM:
+ return PixelFormat::A8B8G8R8_SNORM;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SINT:
+ return PixelFormat::A8B8G8R8_SINT;
+ case Tegra::RenderTargetFormat::A8B8G8R8_UINT:
+ return PixelFormat::A8B8G8R8_UINT;
+ case Tegra::RenderTargetFormat::R16G16_UNORM:
+ return PixelFormat::R16G16_UNORM;
+ case Tegra::RenderTargetFormat::R16G16_SNORM:
+ return PixelFormat::R16G16_SNORM;
+ case Tegra::RenderTargetFormat::R16G16_SINT:
+ return PixelFormat::R16G16_SINT;
+ case Tegra::RenderTargetFormat::R16G16_UINT:
+ return PixelFormat::R16G16_UINT;
+ case Tegra::RenderTargetFormat::R16G16_FLOAT:
+ return PixelFormat::R16G16_FLOAT;
+ case Tegra::RenderTargetFormat::B10G11R11_FLOAT:
+ return PixelFormat::B10G11R11_FLOAT;
+ case Tegra::RenderTargetFormat::R32_SINT:
+ return PixelFormat::R32_SINT;
+ case Tegra::RenderTargetFormat::R32_UINT:
+ return PixelFormat::R32_UINT;
+ case Tegra::RenderTargetFormat::R32_FLOAT:
+ return PixelFormat::R32_FLOAT;
+ case Tegra::RenderTargetFormat::R5G6B5_UNORM:
+ return PixelFormat::R5G6B5_UNORM;
+ case Tegra::RenderTargetFormat::A1R5G5B5_UNORM:
+ return PixelFormat::A1R5G5B5_UNORM;
+ case Tegra::RenderTargetFormat::R8G8_UNORM:
+ return PixelFormat::R8G8_UNORM;
+ case Tegra::RenderTargetFormat::R8G8_SNORM:
+ return PixelFormat::R8G8_SNORM;
+ case Tegra::RenderTargetFormat::R8G8_SINT:
+ return PixelFormat::R8G8_SINT;
+ case Tegra::RenderTargetFormat::R8G8_UINT:
+ return PixelFormat::R8G8_UINT;
case Tegra::RenderTargetFormat::R16_UNORM:
- return PixelFormat::R16U;
+ return PixelFormat::R16_UNORM;
case Tegra::RenderTargetFormat::R16_SNORM:
- return PixelFormat::R16S;
- case Tegra::RenderTargetFormat::R16_UINT:
- return PixelFormat::R16UI;
+ return PixelFormat::R16_SNORM;
case Tegra::RenderTargetFormat::R16_SINT:
- return PixelFormat::R16I;
- case Tegra::RenderTargetFormat::R32_FLOAT:
- return PixelFormat::R32F;
- case Tegra::RenderTargetFormat::R32_SINT:
- return PixelFormat::R32I;
- case Tegra::RenderTargetFormat::R32_UINT:
- return PixelFormat::R32UI;
- case Tegra::RenderTargetFormat::RG32_UINT:
- return PixelFormat::RG32UI;
- case Tegra::RenderTargetFormat::RGBX16_FLOAT:
- return PixelFormat::RGBX16F;
+ return PixelFormat::R16_SINT;
+ case Tegra::RenderTargetFormat::R16_UINT:
+ return PixelFormat::R16_UINT;
+ case Tegra::RenderTargetFormat::R16_FLOAT:
+ return PixelFormat::R16_FLOAT;
+ case Tegra::RenderTargetFormat::R8_UNORM:
+ return PixelFormat::R8_UNORM;
+ case Tegra::RenderTargetFormat::R8_SNORM:
+ return PixelFormat::R8_SNORM;
+ case Tegra::RenderTargetFormat::R8_SINT:
+ return PixelFormat::R8_SINT;
+ case Tegra::RenderTargetFormat::R8_UINT:
+ return PixelFormat::R8_UINT;
default:
- LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
- UNREACHABLE();
- return PixelFormat::RGBA8_SRGB;
+ UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<int>(format));
+ return PixelFormat::A8B8G8R8_UNORM;
}
}
PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
switch (format) {
- case Tegra::FramebufferConfig::PixelFormat::ABGR8:
- return PixelFormat::ABGR8U;
- case Tegra::FramebufferConfig::PixelFormat::RGB565:
- return PixelFormat::B5G6R5U;
- case Tegra::FramebufferConfig::PixelFormat::BGRA8:
- return PixelFormat::BGRA8;
+ case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM:
+ return PixelFormat::A8B8G8R8_UNORM;
+ case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM:
+ return PixelFormat::R5G6B5_UNORM;
+ case Tegra::FramebufferConfig::PixelFormat::B8G8R8A8_UNORM:
+ return PixelFormat::B8G8R8A8_UNORM;
default:
UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format));
- return PixelFormat::ABGR8U;
+ return PixelFormat::A8B8G8R8_UNORM;
}
}
@@ -210,27 +226,27 @@ SurfaceType GetFormatType(PixelFormat pixel_format) {
bool IsPixelFormatASTC(PixelFormat format) {
switch (format) {
- case PixelFormat::ASTC_2D_4X4:
- case PixelFormat::ASTC_2D_5X4:
- case PixelFormat::ASTC_2D_5X5:
- case PixelFormat::ASTC_2D_8X8:
- case PixelFormat::ASTC_2D_8X5:
+ case PixelFormat::ASTC_2D_4X4_UNORM:
+ case PixelFormat::ASTC_2D_5X4_UNORM:
+ case PixelFormat::ASTC_2D_5X5_UNORM:
+ case PixelFormat::ASTC_2D_8X8_UNORM:
+ case PixelFormat::ASTC_2D_8X5_UNORM:
case PixelFormat::ASTC_2D_4X4_SRGB:
case PixelFormat::ASTC_2D_5X4_SRGB:
case PixelFormat::ASTC_2D_5X5_SRGB:
case PixelFormat::ASTC_2D_8X8_SRGB:
case PixelFormat::ASTC_2D_8X5_SRGB:
- case PixelFormat::ASTC_2D_10X8:
+ case PixelFormat::ASTC_2D_10X8_UNORM:
case PixelFormat::ASTC_2D_10X8_SRGB:
- case PixelFormat::ASTC_2D_6X6:
+ case PixelFormat::ASTC_2D_6X6_UNORM:
case PixelFormat::ASTC_2D_6X6_SRGB:
- case PixelFormat::ASTC_2D_10X10:
+ case PixelFormat::ASTC_2D_10X10_UNORM:
case PixelFormat::ASTC_2D_10X10_SRGB:
- case PixelFormat::ASTC_2D_12X12:
+ case PixelFormat::ASTC_2D_12X12_UNORM:
case PixelFormat::ASTC_2D_12X12_SRGB:
- case PixelFormat::ASTC_2D_8X6:
+ case
PixelFormat::ASTC_2D_8X6_SRGB: - case PixelFormat::ASTC_2D_6X5: + case PixelFormat::ASTC_2D_6X5_UNORM: case PixelFormat::ASTC_2D_6X5_SRGB: return true; default: @@ -240,12 +256,12 @@ bool IsPixelFormatASTC(PixelFormat format) { bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { - case PixelFormat::RGBA8_SRGB: - case PixelFormat::BGRA8_SRGB: - case PixelFormat::DXT1_SRGB: - case PixelFormat::DXT23_SRGB: - case PixelFormat::DXT45_SRGB: - case PixelFormat::BC7U_SRGB: + case PixelFormat::A8B8G8R8_SRGB: + case PixelFormat::B8G8R8A8_SRGB: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_SRGB: case PixelFormat::ASTC_2D_4X4_SRGB: case PixelFormat::ASTC_2D_8X8_SRGB: case PixelFormat::ASTC_2D_8X5_SRGB: @@ -267,25 +283,4 @@ std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { return {GetDefaultBlockWidth(format), GetDefaultBlockHeight(format)}; } -bool IsFormatBCn(PixelFormat format) { - switch (format) { - case PixelFormat::DXT1: - case PixelFormat::DXT23: - case PixelFormat::DXT45: - case PixelFormat::DXN1: - case PixelFormat::DXN2SNORM: - case PixelFormat::DXN2UNORM: - case PixelFormat::BC7U: - case PixelFormat::BC6H_UF16: - case PixelFormat::BC6H_SF16: - case PixelFormat::DXT1_SRGB: - case PixelFormat::DXT23_SRGB: - case PixelFormat::DXT45_SRGB: - case PixelFormat::BC7U_SRGB: - return true; - default: - return false; - } -} - } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index e0acd44d3..cfd12fa61 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -15,93 +15,105 @@ namespace VideoCore::Surface { enum class PixelFormat { - ABGR8U = 0, - ABGR8S = 1, - ABGR8UI = 2, - B5G6R5U = 3, - A2B10G10R10U = 4, - A1B5G5R5U = 5, - R8U = 6, - R8UI = 7, - RGBA16F = 8, - RGBA16U = 9, - RGBA16S = 10, - RGBA16UI = 11, - R11FG11FB10F = 12, - RGBA32UI = 13, - DXT1 = 14, - DXT23 = 15, - DXT45 = 16, - DXN1 = 17, // This is also known as BC4 - DXN2UNORM = 18, - DXN2SNORM = 19, - BC7U = 20, - BC6H_UF16 = 21, - BC6H_SF16 = 22, - ASTC_2D_4X4 = 23, - BGRA8 = 24, - RGBA32F = 25, - RG32F = 26, - R32F = 27, - R16F = 28, - R16U = 29, - R16S = 30, - R16UI = 31, - R16I = 32, - RG16 = 33, - RG16F = 34, - RG16UI = 35, - RG16I = 36, - RG16S = 37, - RGB32F = 38, - RGBA8_SRGB = 39, - RG8U = 40, - RG8S = 41, - RG32UI = 42, - RGBX16F = 43, - R32UI = 44, - R32I = 45, - ASTC_2D_8X8 = 46, - ASTC_2D_8X5 = 47, - ASTC_2D_5X4 = 48, - BGRA8_SRGB = 49, - DXT1_SRGB = 50, - DXT23_SRGB = 51, - DXT45_SRGB = 52, - BC7U_SRGB = 53, - R4G4B4A4U = 54, - ASTC_2D_4X4_SRGB = 55, - ASTC_2D_8X8_SRGB = 56, - ASTC_2D_8X5_SRGB = 57, - ASTC_2D_5X4_SRGB = 58, - ASTC_2D_5X5 = 59, - ASTC_2D_5X5_SRGB = 60, - ASTC_2D_10X8 = 61, - ASTC_2D_10X8_SRGB = 62, - ASTC_2D_6X6 = 63, - ASTC_2D_6X6_SRGB = 64, - ASTC_2D_10X10 = 65, - ASTC_2D_10X10_SRGB = 66, - ASTC_2D_12X12 = 67, - ASTC_2D_12X12_SRGB = 68, - ASTC_2D_8X6 = 69, - ASTC_2D_8X6_SRGB = 70, - ASTC_2D_6X5 = 71, - ASTC_2D_6X5_SRGB = 72, - E5B9G9R9F = 73, + A8B8G8R8_UNORM, + A8B8G8R8_SNORM, + A8B8G8R8_SINT, + A8B8G8R8_UINT, + R5G6B5_UNORM, + B5G6R5_UNORM, + A1R5G5B5_UNORM, + A2B10G10R10_UNORM, + A2B10G10R10_UINT, + A1B5G5R5_UNORM, + R8_UNORM, + R8_SNORM, + R8_SINT, + R8_UINT, + R16G16B16A16_FLOAT, + R16G16B16A16_UNORM, + R16G16B16A16_SNORM, + R16G16B16A16_SINT, + R16G16B16A16_UINT, + B10G11R11_FLOAT, + R32G32B32A32_UINT, + BC1_RGBA_UNORM, + BC2_UNORM, + BC3_UNORM, + BC4_UNORM, + BC4_SNORM, + BC5_UNORM, + BC5_SNORM, + BC7_UNORM, + BC6H_UFLOAT, + BC6H_SFLOAT, + 
ASTC_2D_4X4_UNORM, + B8G8R8A8_UNORM, + R32G32B32A32_FLOAT, + R32G32B32A32_SINT, + R32G32_FLOAT, + R32G32_SINT, + R32_FLOAT, + R16_FLOAT, + R16_UNORM, + R16_SNORM, + R16_UINT, + R16_SINT, + R16G16_UNORM, + R16G16_FLOAT, + R16G16_UINT, + R16G16_SINT, + R16G16_SNORM, + R32G32B32_FLOAT, + A8B8G8R8_SRGB, + R8G8_UNORM, + R8G8_SNORM, + R8G8_SINT, + R8G8_UINT, + R32G32_UINT, + R16G16B16X16_FLOAT, + R32_UINT, + R32_SINT, + ASTC_2D_8X8_UNORM, + ASTC_2D_8X5_UNORM, + ASTC_2D_5X4_UNORM, + B8G8R8A8_SRGB, + BC1_RGBA_SRGB, + BC2_SRGB, + BC3_SRGB, + BC7_SRGB, + A4B4G4R4_UNORM, + ASTC_2D_4X4_SRGB, + ASTC_2D_8X8_SRGB, + ASTC_2D_8X5_SRGB, + ASTC_2D_5X4_SRGB, + ASTC_2D_5X5_UNORM, + ASTC_2D_5X5_SRGB, + ASTC_2D_10X8_UNORM, + ASTC_2D_10X8_SRGB, + ASTC_2D_6X6_UNORM, + ASTC_2D_6X6_SRGB, + ASTC_2D_10X10_UNORM, + ASTC_2D_10X10_SRGB, + ASTC_2D_12X12_UNORM, + ASTC_2D_12X12_SRGB, + ASTC_2D_8X6_UNORM, + ASTC_2D_8X6_SRGB, + ASTC_2D_6X5_UNORM, + ASTC_2D_6X5_SRGB, + E5B9G9R9_FLOAT, MaxColorFormat, // Depth formats - Z32F = 74, - Z16 = 75, + D32_FLOAT = MaxColorFormat, + D16_UNORM, MaxDepthFormat, // DepthStencil formats - Z24S8 = 76, - S8Z24 = 77, - Z32FS8 = 78, + D24_UNORM_S8_UINT = MaxDepthFormat, + S8_UINT_D24_UNORM, + D32_FLOAT_S8_UINT, MaxDepthStencilFormat, @@ -129,85 +141,97 @@ enum class SurfaceTarget { }; constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ - 0, // ABGR8U - 0, // ABGR8S - 0, // ABGR8UI - 0, // B5G6R5U - 0, // A2B10G10R10U - 0, // A1B5G5R5U - 0, // R8U - 0, // R8UI - 0, // RGBA16F - 0, // RGBA16U - 0, // RGBA16S - 0, // RGBA16UI - 0, // R11FG11FB10F - 0, // RGBA32UI - 2, // DXT1 - 2, // DXT23 - 2, // DXT45 - 2, // DXN1 - 2, // DXN2UNORM - 2, // DXN2SNORM - 2, // BC7U - 2, // BC6H_UF16 - 2, // BC6H_SF16 - 2, // ASTC_2D_4X4 - 0, // BGRA8 - 0, // RGBA32F - 0, // RG32F - 0, // R32F - 0, // R16F - 0, // R16U - 0, // R16S - 0, // R16UI - 0, // R16I - 0, // RG16 - 0, // RG16F - 0, // RG16UI - 0, // RG16I - 0, // RG16S - 0, // RGB32F - 0, // RGBA8_SRGB - 0, // RG8U - 0, // RG8S - 0, // RG32UI - 0, // RGBX16F - 0, // R32UI - 0, // R32I - 2, // ASTC_2D_8X8 - 2, // ASTC_2D_8X5 - 2, // ASTC_2D_5X4 - 0, // BGRA8_SRGB - 2, // DXT1_SRGB - 2, // DXT23_SRGB - 2, // DXT45_SRGB - 2, // BC7U_SRGB - 0, // R4G4B4A4U + 0, // A8B8G8R8_UNORM + 0, // A8B8G8R8_SNORM + 0, // A8B8G8R8_SINT + 0, // A8B8G8R8_UINT + 0, // R5G6B5_UNORM + 0, // B5G6R5_UNORM + 0, // A1R5G5B5_UNORM + 0, // A2B10G10R10_UNORM + 0, // A2B10G10R10_UINT + 0, // A1B5G5R5_UNORM + 0, // R8_UNORM + 0, // R8_SNORM + 0, // R8_SINT + 0, // R8_UINT + 0, // R16G16B16A16_FLOAT + 0, // R16G16B16A16_UNORM + 0, // R16G16B16A16_SNORM + 0, // R16G16B16A16_SINT + 0, // R16G16B16A16_UINT + 0, // B10G11R11_FLOAT + 0, // R32G32B32A32_UINT + 2, // BC1_RGBA_UNORM + 2, // BC2_UNORM + 2, // BC3_UNORM + 2, // BC4_UNORM + 2, // BC4_SNORM + 2, // BC5_UNORM + 2, // BC5_SNORM + 2, // BC7_UNORM + 2, // BC6H_UFLOAT + 2, // BC6H_SFLOAT + 2, // ASTC_2D_4X4_UNORM + 0, // B8G8R8A8_UNORM + 0, // R32G32B32A32_FLOAT + 0, // R32G32B32A32_SINT + 0, // R32G32_FLOAT + 0, // R32G32_SINT + 0, // R32_FLOAT + 0, // R16_FLOAT + 0, // R16_UNORM + 0, // R16_SNORM + 0, // R16_UINT + 0, // R16_SINT + 0, // R16G16_UNORM + 0, // R16G16_FLOAT + 0, // R16G16_UINT + 0, // R16G16_SINT + 0, // R16G16_SNORM + 0, // R32G32B32_FLOAT + 0, // A8B8G8R8_SRGB + 0, // R8G8_UNORM + 0, // R8G8_SNORM + 0, // R8G8_SINT + 0, // R8G8_UINT + 0, // R32G32_UINT + 0, // R16G16B16X16_FLOAT + 0, // R32_UINT + 0, // R32_SINT + 2, // ASTC_2D_8X8_UNORM + 2, // ASTC_2D_8X5_UNORM + 2, // ASTC_2D_5X4_UNORM + 0, // 
B8G8R8A8_SRGB + 2, // BC1_RGBA_SRGB + 2, // BC2_SRGB + 2, // BC3_SRGB + 2, // BC7_SRGB + 0, // A4B4G4R4_UNORM 2, // ASTC_2D_4X4_SRGB 2, // ASTC_2D_8X8_SRGB 2, // ASTC_2D_8X5_SRGB 2, // ASTC_2D_5X4_SRGB - 2, // ASTC_2D_5X5 + 2, // ASTC_2D_5X5_UNORM 2, // ASTC_2D_5X5_SRGB - 2, // ASTC_2D_10X8 + 2, // ASTC_2D_10X8_UNORM 2, // ASTC_2D_10X8_SRGB - 2, // ASTC_2D_6X6 + 2, // ASTC_2D_6X6_UNORM 2, // ASTC_2D_6X6_SRGB - 2, // ASTC_2D_10X10 + 2, // ASTC_2D_10X10_UNORM 2, // ASTC_2D_10X10_SRGB - 2, // ASTC_2D_12X12 + 2, // ASTC_2D_12X12_UNORM 2, // ASTC_2D_12X12_SRGB - 2, // ASTC_2D_8X6 + 2, // ASTC_2D_8X6_UNORM 2, // ASTC_2D_8X6_SRGB - 2, // ASTC_2D_6X5 + 2, // ASTC_2D_6X5_UNORM 2, // ASTC_2D_6X5_SRGB - 0, // E5B9G9R9F - 0, // Z32F - 0, // Z16 - 0, // Z24S8 - 0, // S8Z24 - 0, // Z32FS8 + 0, // E5B9G9R9_FLOAT + 0, // D32_FLOAT + 0, // D16_UNORM + 0, // D24_UNORM_S8_UINT + 0, // S8_UINT_D24_UNORM + 0, // D32_FLOAT_S8_UINT }}; /** @@ -227,85 +251,97 @@ inline constexpr u32 GetCompressionFactor(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16S - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // RGBX16F - 1, // R32UI - 1, // R32I - 8, // ASTC_2D_8X8 - 8, // ASTC_2D_8X5 - 5, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 1, // R4G4B4A4U + 1, // A8B8G8R8_UNORM + 1, // A8B8G8R8_SNORM + 1, // A8B8G8R8_SINT + 1, // A8B8G8R8_UINT + 1, // R5G6B5_UNORM + 1, // B5G6R5_UNORM + 1, // A1R5G5B5_UNORM + 1, // A2B10G10R10_UNORM + 1, // A2B10G10R10_UINT + 1, // A1B5G5R5_UNORM + 1, // R8_UNORM + 1, // R8_SNORM + 1, // R8_SINT + 1, // R8_UINT + 1, // R16G16B16A16_FLOAT + 1, // R16G16B16A16_UNORM + 1, // R16G16B16A16_SNORM + 1, // R16G16B16A16_SINT + 1, // R16G16B16A16_UINT + 1, // B10G11R11_FLOAT + 1, // R32G32B32A32_UINT + 4, // BC1_RGBA_UNORM + 4, // BC2_UNORM + 4, // BC3_UNORM + 4, // BC4_UNORM + 4, // BC4_SNORM + 4, // BC5_UNORM + 4, // BC5_SNORM + 4, // BC7_UNORM + 4, // BC6H_UFLOAT + 4, // BC6H_SFLOAT + 4, // ASTC_2D_4X4_UNORM + 1, // B8G8R8A8_UNORM + 1, // R32G32B32A32_FLOAT + 1, // R32G32B32A32_SINT + 1, // R32G32_FLOAT + 1, // R32G32_SINT + 1, // R32_FLOAT + 1, // R16_FLOAT + 1, // R16_UNORM + 1, // R16_SNORM + 1, // R16_UINT + 1, // R16_SINT + 1, // R16G16_UNORM + 1, // R16G16_FLOAT + 1, // R16G16_UINT + 1, // R16G16_SINT + 1, // R16G16_SNORM + 1, // R32G32B32_FLOAT + 1, // A8B8G8R8_SRGB + 1, // R8G8_UNORM + 1, // R8G8_SNORM + 1, // R8G8_SINT + 1, // R8G8_UINT + 1, // R32G32_UINT + 1, // R16G16B16X16_FLOAT + 1, // R32_UINT + 1, // R32_SINT + 8, // ASTC_2D_8X8_UNORM + 8, // ASTC_2D_8X5_UNORM + 5, // ASTC_2D_5X4_UNORM + 1, // B8G8R8A8_SRGB + 4, // BC1_RGBA_SRGB + 4, // BC2_SRGB + 4, // BC3_SRGB + 4, // BC7_SRGB + 1, // A4B4G4R4_UNORM 4, // ASTC_2D_4X4_SRGB 8, // ASTC_2D_8X8_SRGB 8, // ASTC_2D_8X5_SRGB 5, // ASTC_2D_5X4_SRGB - 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_UNORM 5, // ASTC_2D_5X5_SRGB - 10, // ASTC_2D_10X8 + 
10, // ASTC_2D_10X8_UNORM 10, // ASTC_2D_10X8_SRGB - 6, // ASTC_2D_6X6 + 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB - 10, // ASTC_2D_10X10 + 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB - 12, // ASTC_2D_12X12 + 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB - 8, // ASTC_2D_8X6 + 8, // ASTC_2D_8X6_UNORM 8, // ASTC_2D_8X6_SRGB - 6, // ASTC_2D_6X5 + 6, // ASTC_2D_6X5_UNORM 6, // ASTC_2D_6X5_SRGB - 1, // E5B9G9R9F - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 + 1, // E5B9G9R9_FLOAT + 1, // D32_FLOAT + 1, // D16_UNORM + 1, // D24_UNORM_S8_UINT + 1, // S8_UINT_D24_UNORM + 1, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { @@ -317,85 +353,97 @@ static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16S - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // RGBX16F - 1, // R32UI - 1, // R32I - 8, // ASTC_2D_8X8 - 5, // ASTC_2D_8X5 - 4, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 1, // R4G4B4A4U + 1, // A8B8G8R8_UNORM + 1, // A8B8G8R8_SNORM + 1, // A8B8G8R8_SINT + 1, // A8B8G8R8_UINT + 1, // R5G6B5_UNORM + 1, // B5G6R5_UNORM + 1, // A1R5G5B5_UNORM + 1, // A2B10G10R10_UNORM + 1, // A2B10G10R10_UINT + 1, // A1B5G5R5_UNORM + 1, // R8_UNORM + 1, // R8_SNORM + 1, // R8_SINT + 1, // R8_UINT + 1, // R16G16B16A16_FLOAT + 1, // R16G16B16A16_UNORM + 1, // R16G16B16A16_SNORM + 1, // R16G16B16A16_SINT + 1, // R16G16B16A16_UINT + 1, // B10G11R11_FLOAT + 1, // R32G32B32A32_UINT + 4, // BC1_RGBA_UNORM + 4, // BC2_UNORM + 4, // BC3_UNORM + 4, // BC4_UNORM + 4, // BC4_SNORM + 4, // BC5_UNORM + 4, // BC5_SNORM + 4, // BC7_UNORM + 4, // BC6H_UFLOAT + 4, // BC6H_SFLOAT + 4, // ASTC_2D_4X4_UNORM + 1, // B8G8R8A8_UNORM + 1, // R32G32B32A32_FLOAT + 1, // R32G32B32A32_SINT + 1, // R32G32_FLOAT + 1, // R32G32_SINT + 1, // R32_FLOAT + 1, // R16_FLOAT + 1, // R16_UNORM + 1, // R16_SNORM + 1, // R16_UINT + 1, // R16_SINT + 1, // R16G16_UNORM + 1, // R16G16_FLOAT + 1, // R16G16_UINT + 1, // R16G16_SINT + 1, // R16G16_SNORM + 1, // R32G32B32_FLOAT + 1, // A8B8G8R8_SRGB + 1, // R8G8_UNORM + 1, // R8G8_SNORM + 1, // R8G8_SINT + 1, // R8G8_UINT + 1, // R32G32_UINT + 1, // R16G16B16X16_FLOAT + 1, // R32_UINT + 1, // R32_SINT + 8, // ASTC_2D_8X8_UNORM + 5, // ASTC_2D_8X5_UNORM + 4, // ASTC_2D_5X4_UNORM + 1, // B8G8R8A8_SRGB + 4, // BC1_RGBA_SRGB + 4, // BC2_SRGB + 4, // BC3_SRGB + 4, // BC7_SRGB + 1, // A4B4G4R4_UNORM 4, // ASTC_2D_4X4_SRGB 8, // ASTC_2D_8X8_SRGB 5, // ASTC_2D_8X5_SRGB 4, // ASTC_2D_5X4_SRGB - 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_UNORM 5, // ASTC_2D_5X5_SRGB - 8, // ASTC_2D_10X8 + 8, // ASTC_2D_10X8_UNORM 8, // ASTC_2D_10X8_SRGB - 6, // ASTC_2D_6X6 + 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB - 10, // ASTC_2D_10X10 + 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB - 12, // ASTC_2D_12X12 + 12, 
// ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB - 6, // ASTC_2D_8X6 + 6, // ASTC_2D_8X6_UNORM 6, // ASTC_2D_8X6_SRGB - 5, // ASTC_2D_6X5 + 5, // ASTC_2D_6X5_UNORM 5, // ASTC_2D_6X5_SRGB - 1, // E5B9G9R9F - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 + 1, // E5B9G9R9_FLOAT + 1, // D32_FLOAT + 1, // D16_UNORM + 1, // D24_UNORM_S8_UINT + 1, // S8_UINT_D24_UNORM + 1, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { @@ -407,85 +455,97 @@ static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ - 32, // ABGR8U - 32, // ABGR8S - 32, // ABGR8UI - 16, // B5G6R5U - 32, // A2B10G10R10U - 16, // A1B5G5R5U - 8, // R8U - 8, // R8UI - 64, // RGBA16F - 64, // RGBA16U - 64, // RGBA16S - 64, // RGBA16UI - 32, // R11FG11FB10F - 128, // RGBA32UI - 64, // DXT1 - 128, // DXT23 - 128, // DXT45 - 64, // DXN1 - 128, // DXN2UNORM - 128, // DXN2SNORM - 128, // BC7U - 128, // BC6H_UF16 - 128, // BC6H_SF16 - 128, // ASTC_2D_4X4 - 32, // BGRA8 - 128, // RGBA32F - 64, // RG32F - 32, // R32F - 16, // R16F - 16, // R16U - 16, // R16S - 16, // R16UI - 16, // R16I - 32, // RG16 - 32, // RG16F - 32, // RG16UI - 32, // RG16I - 32, // RG16S - 96, // RGB32F - 32, // RGBA8_SRGB - 16, // RG8U - 16, // RG8S - 64, // RG32UI - 64, // RGBX16F - 32, // R32UI - 32, // R32I - 128, // ASTC_2D_8X8 - 128, // ASTC_2D_8X5 - 128, // ASTC_2D_5X4 - 32, // BGRA8_SRGB - 64, // DXT1_SRGB - 128, // DXT23_SRGB - 128, // DXT45_SRGB - 128, // BC7U - 16, // R4G4B4A4U + 32, // A8B8G8R8_UNORM + 32, // A8B8G8R8_SNORM + 32, // A8B8G8R8_SINT + 32, // A8B8G8R8_UINT + 16, // R5G6B5_UNORM + 16, // B5G6R5_UNORM + 16, // A1R5G5B5_UNORM + 32, // A2B10G10R10_UNORM + 32, // A2B10G10R10_UINT + 16, // A1B5G5R5_UNORM + 8, // R8_UNORM + 8, // R8_SNORM + 8, // R8_SINT + 8, // R8_UINT + 64, // R16G16B16A16_FLOAT + 64, // R16G16B16A16_UNORM + 64, // R16G16B16A16_SNORM + 64, // R16G16B16A16_SINT + 64, // R16G16B16A16_UINT + 32, // B10G11R11_FLOAT + 128, // R32G32B32A32_UINT + 64, // BC1_RGBA_UNORM + 128, // BC2_UNORM + 128, // BC3_UNORM + 64, // BC4_UNORM + 64, // BC4_SNORM + 128, // BC5_UNORM + 128, // BC5_SNORM + 128, // BC7_UNORM + 128, // BC6H_UFLOAT + 128, // BC6H_SFLOAT + 128, // ASTC_2D_4X4_UNORM + 32, // B8G8R8A8_UNORM + 128, // R32G32B32A32_FLOAT + 128, // R32G32B32A32_SINT + 64, // R32G32_FLOAT + 64, // R32G32_SINT + 32, // R32_FLOAT + 16, // R16_FLOAT + 16, // R16_UNORM + 16, // R16_SNORM + 16, // R16_UINT + 16, // R16_SINT + 32, // R16G16_UNORM + 32, // R16G16_FLOAT + 32, // R16G16_UINT + 32, // R16G16_SINT + 32, // R16G16_SNORM + 96, // R32G32B32_FLOAT + 32, // A8B8G8R8_SRGB + 16, // R8G8_UNORM + 16, // R8G8_SNORM + 16, // R8G8_SINT + 16, // R8G8_UINT + 64, // R32G32_UINT + 64, // R16G16B16X16_FLOAT + 32, // R32_UINT + 32, // R32_SINT + 128, // ASTC_2D_8X8_UNORM + 128, // ASTC_2D_8X5_UNORM + 128, // ASTC_2D_5X4_UNORM + 32, // B8G8R8A8_SRGB + 64, // BC1_RGBA_SRGB + 128, // BC2_SRGB + 128, // BC3_SRGB + 128, // BC7_UNORM + 16, // A4B4G4R4_UNORM 128, // ASTC_2D_4X4_SRGB 128, // ASTC_2D_8X8_SRGB 128, // ASTC_2D_8X5_SRGB 128, // ASTC_2D_5X4_SRGB - 128, // ASTC_2D_5X5 + 128, // ASTC_2D_5X5_UNORM 128, // ASTC_2D_5X5_SRGB - 128, // ASTC_2D_10X8 + 128, // ASTC_2D_10X8_UNORM 128, // ASTC_2D_10X8_SRGB - 128, // ASTC_2D_6X6 + 128, // ASTC_2D_6X6_UNORM 128, // ASTC_2D_6X6_SRGB - 128, // ASTC_2D_10X10 + 128, // ASTC_2D_10X10_UNORM 128, // ASTC_2D_10X10_SRGB - 128, // ASTC_2D_12X12 + 128, // ASTC_2D_12X12_UNORM 128, // ASTC_2D_12X12_SRGB - 128, 
// ASTC_2D_8X6 + 128, // ASTC_2D_8X6_UNORM 128, // ASTC_2D_8X6_SRGB - 128, // ASTC_2D_6X5 + 128, // ASTC_2D_6X5_UNORM 128, // ASTC_2D_6X5_SRGB - 32, // E5B9G9R9F - 32, // Z32F - 16, // Z16 - 32, // Z24S8 - 32, // S8Z24 - 64, // Z32FS8 + 32, // E5B9G9R9_FLOAT + 32, // D32_FLOAT + 16, // D16_UNORM + 32, // D24_UNORM_S8_UINT + 32, // S8_UINT_D24_UNORM + 64, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetFormatBpp(PixelFormat format) { @@ -524,7 +584,4 @@ bool IsPixelFormatSRGB(PixelFormat format); std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); -/// Returns true if the specified PixelFormat is a BCn format, e.g. DXT or DXN -bool IsFormatBCn(PixelFormat format); - } // namespace VideoCore::Surface diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index e151c26c4..7d5a75648 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -19,8 +19,6 @@ constexpr auto SNORM = ComponentType::SNORM; constexpr auto UNORM = ComponentType::UNORM; constexpr auto SINT = ComponentType::SINT; constexpr auto UINT = ComponentType::UINT; -constexpr auto SNORM_FORCE_FP16 = ComponentType::SNORM_FORCE_FP16; -constexpr auto UNORM_FORCE_FP16 = ComponentType::UNORM_FORCE_FP16; constexpr auto FLOAT = ComponentType::FLOAT; constexpr bool C = false; // Normal color constexpr bool S = true; // Srgb @@ -41,117 +39,126 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 76> DefinitionTable = {{ - {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, - {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, - {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, - {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA8_SRGB}, +constexpr std::array<Table, 86> DefinitionTable = {{ + {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_UNORM}, + {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::A8B8G8R8_SNORM}, + {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::A8B8G8R8_UINT}, + {TextureFormat::A8R8G8B8, C, SINT, SINT, SINT, SINT, PixelFormat::A8B8G8R8_SINT}, + {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_SRGB}, - {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5U}, + {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5_UNORM}, - {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10U}, + {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10_UNORM}, + {TextureFormat::A2B10G10R10, C, UINT, UINT, UINT, UINT, PixelFormat::A2B10G10R10_UINT}, - {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5U}, + {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5_UNORM}, - {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R4G4B4A4U}, + {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A4B4G4R4_UNORM}, - {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8U}, - {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8UI}, + {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8_UNORM}, + {TextureFormat::R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8_SNORM}, + {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8_UINT}, + 
{TextureFormat::R8, C, SINT, SINT, SINT, SINT, PixelFormat::R8_SINT}, - {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, - {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R8G8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8G8_UNORM}, + {TextureFormat::R8G8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8G8_SNORM}, + {TextureFormat::R8G8, C, UINT, UINT, UINT, UINT, PixelFormat::R8G8_UINT}, + {TextureFormat::R8G8, C, SINT, SINT, SINT, SINT, PixelFormat::R8G8_SINT}, - {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, - {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, - {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, - {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, + {TextureFormat::R16G16B16A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16B16A16_SNORM}, + {TextureFormat::R16G16B16A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16B16A16_UNORM}, + {TextureFormat::R16G16B16A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16B16A16_FLOAT}, + {TextureFormat::R16G16B16A16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16B16A16_UINT}, + {TextureFormat::R16G16B16A16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16B16A16_SINT}, - {TextureFormat::R16_G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RG16F}, - {TextureFormat::R16_G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG16}, - {TextureFormat::R16_G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG16S}, - {TextureFormat::R16_G16, C, UINT, UINT, UINT, UINT, PixelFormat::RG16UI}, - {TextureFormat::R16_G16, C, SINT, SINT, SINT, SINT, PixelFormat::RG16I}, + {TextureFormat::R16G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16_FLOAT}, + {TextureFormat::R16G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16_UNORM}, + {TextureFormat::R16G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16_SNORM}, + {TextureFormat::R16G16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16_UINT}, + {TextureFormat::R16G16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16_SINT}, - {TextureFormat::R16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16F}, - {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16U}, - {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16S}, - {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16UI}, - {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16I}, + {TextureFormat::R16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16_FLOAT}, + {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16_UNORM}, + {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16_SNORM}, + {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16_UINT}, + {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16_SINT}, - {TextureFormat::BF10GF11RF11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R11FG11FB10F}, + {TextureFormat::B10G11R11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::B10G11R11_FLOAT}, - {TextureFormat::R32_G32_B32_A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA32F}, - {TextureFormat::R32_G32_B32_A32, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA32UI}, + {TextureFormat::R32G32B32A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32A32_FLOAT}, + {TextureFormat::R32G32B32A32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32B32A32_UINT}, + {TextureFormat::R32G32B32A32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32B32A32_SINT}, - 
{TextureFormat::R32_G32_B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGB32F}, + {TextureFormat::R32G32B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32_FLOAT}, - {TextureFormat::R32_G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RG32F}, - {TextureFormat::R32_G32, C, UINT, UINT, UINT, UINT, PixelFormat::RG32UI}, + {TextureFormat::R32G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32_FLOAT}, + {TextureFormat::R32G32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32_UINT}, + {TextureFormat::R32G32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32_SINT}, - {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32F}, - {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32UI}, - {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32I}, + {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32_FLOAT}, + {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32_UINT}, + {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32_SINT}, - {TextureFormat::E5B9G9R9_SHAREDEXP, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9F}, + {TextureFormat::E5B9G9R9, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9_FLOAT}, - {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, - {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, - {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, - {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, + {TextureFormat::D32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::D32_FLOAT}, + {TextureFormat::D16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::D16_UNORM}, + {TextureFormat::S8D24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, + {TextureFormat::R8G24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, + {TextureFormat::D32S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::D32_FLOAT_S8_UINT}, - {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, - {TextureFormat::DXT1, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1_SRGB}, + {TextureFormat::BC1_RGBA, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_UNORM}, + {TextureFormat::BC1_RGBA, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_SRGB}, - {TextureFormat::DXT23, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT23}, - {TextureFormat::DXT23, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT23_SRGB}, + {TextureFormat::BC2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_UNORM}, + {TextureFormat::BC2, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_SRGB}, - {TextureFormat::DXT45, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT45}, - {TextureFormat::DXT45, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT45_SRGB}, + {TextureFormat::BC3, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_UNORM}, + {TextureFormat::BC3, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_SRGB}, - // TODO: Use a different pixel format for SNORM - {TextureFormat::DXN1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXN1}, - {TextureFormat::DXN1, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::DXN1}, + {TextureFormat::BC4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC4_UNORM}, + {TextureFormat::BC4, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC4_SNORM}, - {TextureFormat::DXN2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXN2UNORM}, - {TextureFormat::DXN2, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::DXN2SNORM}, + {TextureFormat::BC5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC5_UNORM}, + {TextureFormat::BC5, C, SNORM, SNORM, SNORM, SNORM, 
PixelFormat::BC5_SNORM}, - {TextureFormat::BC7U, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7U}, - {TextureFormat::BC7U, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7U_SRGB}, + {TextureFormat::BC7, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_UNORM}, + {TextureFormat::BC7, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_SRGB}, - {TextureFormat::BC6H_SF16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SF16}, - {TextureFormat::BC6H_UF16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UF16}, + {TextureFormat::BC6H_SFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SFLOAT}, + {TextureFormat::BC6H_UFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UFLOAT}, - {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4}, + {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_UNORM}, {TextureFormat::ASTC_2D_4X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_SRGB}, - {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4}, + {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_UNORM}, {TextureFormat::ASTC_2D_5X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_SRGB}, - {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5}, + {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_UNORM}, {TextureFormat::ASTC_2D_5X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_SRGB}, - {TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8}, + {TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_UNORM}, {TextureFormat::ASTC_2D_8X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_SRGB}, - {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5}, + {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_UNORM}, {TextureFormat::ASTC_2D_8X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_SRGB}, - {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8}, + {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_UNORM}, {TextureFormat::ASTC_2D_10X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_SRGB}, - {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6}, + {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_UNORM}, {TextureFormat::ASTC_2D_6X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_SRGB}, - {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10}, + {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_UNORM}, {TextureFormat::ASTC_2D_10X10, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_SRGB}, - {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12}, + {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_UNORM}, {TextureFormat::ASTC_2D_12X12, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_SRGB}, - {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6}, + {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_UNORM}, {TextureFormat::ASTC_2D_8X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_SRGB}, - {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, 
PixelFormat::ASTC_2D_6X5}, + {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_UNORM}, {TextureFormat::ASTC_2D_6X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_SRGB}, }}; @@ -182,7 +189,7 @@ PixelFormat FormatLookupTable::GetPixelFormat(TextureFormat format, bool is_srgb static_cast<int>(format), is_srgb, static_cast<int>(red_component), static_cast<int>(green_component), static_cast<int>(blue_component), static_cast<int>(alpha_component)); - return PixelFormat::ABGR8U; + return PixelFormat::A8B8G8R8_UNORM; } void FormatLookupTable::Set(TextureFormat format, bool is_srgb, ComponentType red_component, @@ -196,9 +203,9 @@ std::size_t FormatLookupTable::CalculateIndex(TextureFormat format, bool is_srgb ComponentType alpha_component) noexcept { const auto format_index = static_cast<std::size_t>(format); const auto red_index = static_cast<std::size_t>(red_component); - const auto green_index = static_cast<std::size_t>(red_component); - const auto blue_index = static_cast<std::size_t>(red_component); - const auto alpha_index = static_cast<std::size_t>(red_component); + const auto green_index = static_cast<std::size_t>(green_component); + const auto blue_index = static_cast<std::size_t>(blue_component); + const auto alpha_index = static_cast<std::size_t>(alpha_component); const std::size_t srgb_index = is_srgb ? 1 : 0; return format_index * PerFormat + diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 7af0e792c..b44c09d71 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -115,17 +115,24 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap( if (gpu_addr == candidate_gpu_addr) { return {{0, 0}}; } + if (candidate_gpu_addr < gpu_addr) { - return {}; + return std::nullopt; } + const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)}; const auto layer{static_cast<u32>(relative_address / layer_size)}; + if (layer >= params.depth) { + return std::nullopt; + } + const GPUVAddr mipmap_address = relative_address - layer_size * layer; const auto mipmap_it = Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); if (mipmap_it == mipmap_offsets.end()) { - return {}; + return std::nullopt; } + const auto level{static_cast<u32>(std::distance(mipmap_offsets.begin(), mipmap_it))}; return std::make_pair(layer, level); } @@ -225,7 +232,7 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager, } } - if (!is_converted && params.pixel_format != PixelFormat::S8Z24) { + if (!is_converted && params.pixel_format != PixelFormat::S8_UINT_D24_UNORM) { return; } @@ -251,6 +258,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, tmp_buffer.resize(guest_memory_size); host_ptr = tmp_buffer.data(); + if (params.target == SurfaceTarget::Texture3D) { + // Special case for 3D texture segments + memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); + } + if (params.is_tiled) { ASSERT_MSG(params.block_width == 0, "Block width is defined as {}", params.block_width); for (u32 level = 0; level < params.num_levels; ++level) { diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index a39a8661b..173f2edba 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -72,9 +72,9 @@ public: return (cpu_addr < end) && (cpu_addr_end > start); } - 
bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) {
+ bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) const {
const GPUVAddr gpu_addr_end = gpu_addr + guest_memory_size;
- return (gpu_addr <= other_start && other_end <= gpu_addr_end);
+ return gpu_addr <= other_start && other_end <= gpu_addr_end;
}
// Use only when recycling a surface
@@ -192,6 +192,22 @@ public:
index = index_;
}
+ void SetMemoryMarked(bool is_memory_marked_) {
+ is_memory_marked = is_memory_marked_;
+ }
+
+ bool IsMemoryMarked() const {
+ return is_memory_marked;
+ }
+
+ void SetSyncPending(bool is_sync_pending_) {
+ is_sync_pending = is_sync_pending_;
+ }
+
+ bool IsSyncPending() const {
+ return is_sync_pending;
+ }
+
void MarkAsPicked(bool is_picked_) {
is_picked = is_picked_;
}
@@ -201,8 +217,8 @@ public:
}
bool IsProtected() const {
- // Only 3D Slices are to be protected
- return is_target && params.block_depth > 0;
+ // Only 3D slices are to be protected
+ return is_target && params.target == SurfaceTarget::Texture3D;
}
bool IsRenderTarget() const {
@@ -234,6 +250,11 @@ public:
return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
}
+ TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+ return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
+ base_level, num_levels));
+ }
+
std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
const GPUVAddr view_addr,
const std::size_t candidate_size, const u32 mipmap,
@@ -256,8 +277,8 @@ public:
std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
const std::size_t candidate_size) {
if (params.target == SurfaceTarget::Texture3D ||
- (params.num_levels == 1 && !params.is_layered) ||
- view_params.target == SurfaceTarget::Texture3D) {
+ view_params.target == SurfaceTarget::Texture3D ||
+ (params.num_levels == 1 && !params.is_layered)) {
return {};
}
const auto layer_mipmap{GetLayerMipmap(view_addr)};
@@ -303,6 +324,8 @@ private:
bool is_target{};
bool is_registered{};
bool is_picked{};
+ bool is_memory_marked{};
+ bool is_sync_pending{};
u32 index{NO_RT};
u64 modification_tick{};
};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 6f3ef45be..13dd16356 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -74,21 +74,21 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
SurfaceParams params;
params.is_tiled = tic.IsTiled();
params.srgb_conversion = tic.IsSrgbConversionEnabled();
- params.block_width = params.is_tiled ? tic.BlockWidth() : 0,
- params.block_height = params.is_tiled ? tic.BlockHeight() : 0,
- params.block_depth = params.is_tiled ? tic.BlockDepth() : 0,
+ params.block_width = params.is_tiled ? tic.BlockWidth() : 0;
+ params.block_height = params.is_tiled ? tic.BlockHeight() : 0;
+ params.block_depth = params.is_tiled ? tic.BlockDepth() : 0;
params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1;
params.pixel_format = lookup_table.GetPixelFormat(
tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type);
params.type = GetFormatType(params.pixel_format);
- if (entry.IsShadow() && params.type == SurfaceType::ColorTexture) {
+ if (entry.is_shadow && params.type == SurfaceType::ColorTexture) {
switch (params.pixel_format) {
- case PixelFormat::R16U:
- case PixelFormat::R16F:
- params.pixel_format = PixelFormat::Z16;
+ case PixelFormat::R16_UNORM:
+ case PixelFormat::R16_FLOAT:
+ params.pixel_format = PixelFormat::D16_UNORM;
break;
- case PixelFormat::R32F:
- params.pixel_format = PixelFormat::Z32F;
+ case PixelFormat::R32_FLOAT:
+ params.pixel_format = PixelFormat::D32_FLOAT;
break;
default:
UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}",
@@ -96,7 +96,6 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
}
params.type = GetFormatType(params.pixel_format);
}
- params.type = GetFormatType(params.pixel_format);
// TODO: on 1DBuffer we should use the tic info.
if (tic.IsBuffer()) {
params.target = SurfaceTarget::TextureBuffer;
params.width = tic.Width();
params.pitch = params.width * params.GetBytesPerPixel();
params.height = 1;
params.depth = 1;
params.num_levels = 1;
params.emulated_levels = 1;
params.is_layered = false;
} else {
- params.target = TextureTypeToSurfaceTarget(entry.GetType(), entry.IsArray());
+ params.target = TextureTypeToSurfaceTarget(entry.type, entry.is_array);
params.width = tic.Width();
params.height = tic.Height();
params.depth = tic.Depth();
@@ -130,15 +129,14 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl
SurfaceParams params;
params.is_tiled = tic.IsTiled();
params.srgb_conversion = tic.IsSrgbConversionEnabled();
- params.block_width = params.is_tiled ? tic.BlockWidth() : 0,
- params.block_height = params.is_tiled ? tic.BlockHeight() : 0,
- params.block_depth = params.is_tiled ? tic.BlockDepth() : 0,
+ params.block_width = params.is_tiled ? tic.BlockWidth() : 0;
+ params.block_height = params.is_tiled ? tic.BlockHeight() : 0;
+ params.block_depth = params.is_tiled ? tic.BlockDepth() : 0;
params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1;
params.pixel_format = lookup_table.GetPixelFormat(
tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type);
params.type = GetFormatType(params.pixel_format);
- params.type = GetFormatType(params.pixel_format);
- params.target = ImageTypeToSurfaceTarget(entry.GetType());
+ params.target = ImageTypeToSurfaceTarget(entry.type);
// TODO: on 1DBuffer we should use the tic info.
if (tic.IsBuffer()) { params.target = SurfaceTarget::TextureBuffer; @@ -165,39 +163,40 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl return params; } -SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) { - const auto& regs = system.GPU().Maxwell3D().regs; - regs.zeta_width, regs.zeta_height, regs.zeta.format, regs.zeta.memory_layout.type; - SurfaceParams params; - params.is_tiled = regs.zeta.memory_layout.type == - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = false; - params.block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U); - params.block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U); - params.block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - params.type = GetFormatType(params.pixel_format); - params.width = regs.zeta_width; - params.height = regs.zeta_height; - params.pitch = 0; - params.num_levels = 1; - params.emulated_levels = 1; - - const bool is_layered = regs.zeta_layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; - params.depth = is_layered ? regs.zeta_layers.Value() : 1U; - return params; +SurfaceParams SurfaceParams::CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d) { + const auto& regs = maxwell3d.regs; + const auto block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); + const bool is_layered = regs.zeta_layers > 1 && block_depth == 0; + const auto pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); + return { + .is_tiled = regs.zeta.memory_layout.type == + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear, + .srgb_conversion = false, + .is_layered = is_layered, + .block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U), + .block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U), + .block_depth = block_depth, + .tile_width_spacing = 1, + .width = regs.zeta_width, + .height = regs.zeta_height, + .depth = is_layered ? regs.zeta_layers.Value() : 1U, + .pitch = 0, + .num_levels = 1, + .emulated_levels = 1, + .pixel_format = pixel_format, + .type = GetFormatType(pixel_format), + .target = is_layered ? 
SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D, + }; } -SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::size_t index) { - const auto& config{system.GPU().Maxwell3D().regs.rt[index]}; +SurfaceParams SurfaceParams::CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, + std::size_t index) { + const auto& config{maxwell3d.regs.rt[index]}; SurfaceParams params; params.is_tiled = config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; + params.srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || + config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB; params.block_width = config.memory_layout.block_width; params.block_height = config.memory_layout.block_height; params.block_depth = config.memory_layout.block_depth; @@ -216,45 +215,60 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.num_levels = 1; params.emulated_levels = 1; - const bool is_layered = config.layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.depth = is_layered ? config.layers.Value() : 1; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + if (config.memory_layout.is_3d != 0) { + params.depth = config.layers.Value(); + params.is_layered = false; + params.target = SurfaceTarget::Texture3D; + } else if (config.layers > 1) { + params.depth = config.layers.Value(); + params.is_layered = true; + params.target = SurfaceTarget::Texture2DArray; + } else { + params.depth = 1; + params.is_layered = false; + params.target = SurfaceTarget::Texture2D; + } return params; } SurfaceParams SurfaceParams::CreateForFermiCopySurface( const Tegra::Engines::Fermi2D::Regs::Surface& config) { - SurfaceParams params{}; - params.is_tiled = !config.linear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; - params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 5U) : 0, - params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 5U) : 0, - params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 5U) : 0, - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.type = GetFormatType(params.pixel_format); - params.width = config.width; - params.height = config.height; - params.pitch = config.pitch; - // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters - params.target = SurfaceTarget::Texture2D; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; + const bool is_tiled = !config.linear; + const auto pixel_format = PixelFormatFromRenderTargetFormat(config.format); + + SurfaceParams params{ + .is_tiled = is_tiled, + .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || + config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB, + .is_layered = false, + .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U, + .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U, + .block_depth = is_tiled ? 
std::min(config.BlockDepth(), 5U) : 0U, + .tile_width_spacing = 1, + .width = config.width, + .height = config.height, + .depth = 1, + .pitch = config.pitch, + .num_levels = 1, + .emulated_levels = 1, + .pixel_format = pixel_format, + .type = GetFormatType(pixel_format), + // TODO(Rodrigo): Try to guess texture arrays from parameters + .target = SurfaceTarget::Texture2D, + }; + params.is_layered = params.IsLayered(); return params; } VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( const VideoCommon::Shader::Sampler& entry) { - return TextureTypeToSurfaceTarget(entry.GetType(), entry.IsArray()); + return TextureTypeToSurfaceTarget(entry.type, entry.is_array); } VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( const VideoCommon::Shader::Image& entry) { - return ImageTypeToSurfaceTarget(entry.GetType()); + return ImageTypeToSurfaceTarget(entry.type); } bool SurfaceParams::IsLayered() const { @@ -335,8 +349,7 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co size += GetInnerMipmapMemorySize(level, as_host_size, uncompressed); } if (is_tiled && is_layered) { - return Common::AlignBits(size, - Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); + return Common::AlignBits(size, Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); } return size; } @@ -410,7 +423,7 @@ std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const { const u32 block_size = GetBlockSize(); const u32 block_index = offset / block_size; const u32 gob_offset = offset % block_size; - const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GetGOBSize()); + const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GOB_SIZE); const u32 x_gob_pixels = 64U / GetBytesPerPixel(); const u32 x_block_pixels = x_gob_pixels << block_width; const u32 y_block_pixels = 8U << block_height; diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 24957df8d..4466c3c34 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -33,10 +33,11 @@ public: const VideoCommon::Shader::Image& entry); /// Creates SurfaceCachedParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer(Core::System& system); + static SurfaceParams CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d); /// Creates SurfaceCachedParams from a framebuffer configuration. - static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index); + static SurfaceParams CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, + std::size_t index); /// Creates SurfaceCachedParams from a Fermi2D surface configuration. static SurfaceParams CreateForFermiCopySurface( @@ -204,7 +205,7 @@ public: static std::size_t AlignLayered(const std::size_t out_size, const u32 block_height, const u32 block_depth) { return Common::AlignBits(out_size, - Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); + Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); } /// Converts a width from a type of surface into another. 
This helps represent the diff --git a/src/video_core/texture_cache/surface_view.cpp b/src/video_core/texture_cache/surface_view.cpp index 57a1f5803..6b5f5984b 100644 --- a/src/video_core/texture_cache/surface_view.cpp +++ b/src/video_core/texture_cache/surface_view.cpp @@ -20,4 +20,8 @@ bool ViewParams::operator==(const ViewParams& rhs) const { std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels, rhs.target); } +bool ViewParams::operator!=(const ViewParams& rhs) const { + return !operator==(rhs); +} + } // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_view.h b/src/video_core/texture_cache/surface_view.h index b17fd11a9..90a8bb0ae 100644 --- a/src/video_core/texture_cache/surface_view.h +++ b/src/video_core/texture_cache/surface_view.h @@ -21,6 +21,7 @@ struct ViewParams { std::size_t Hash() const; bool operator==(const ViewParams& rhs) const; + bool operator!=(const ViewParams& rhs) const; bool IsLayered() const { switch (target) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4edd4313b..ea835c59f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -6,6 +6,7 @@ #include <algorithm> #include <array> +#include <list> #include <memory> #include <mutex> #include <set> @@ -13,6 +14,7 @@ #include <unordered_map> #include <vector> +#include <boost/container/small_vector.hpp> #include <boost/icl/interval_map.hpp> #include <boost/range/iterator_range.hpp> @@ -22,6 +24,7 @@ #include "core/core.h" #include "core/memory.h" #include "core/settings.h" +#include "video_core/compatible_formats.h" #include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" @@ -45,13 +48,14 @@ class RasterizerInterface; namespace VideoCommon { +using VideoCore::Surface::FormatCompatibility; using VideoCore::Surface::PixelFormat; - using VideoCore::Surface::SurfaceTarget; using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; template <typename TSurface, typename TView> class TextureCache { + using VectorSurface = boost::container::small_vector<TSurface, 1>; public: void InvalidateRegion(VAddr addr, std::size_t size) { @@ -62,6 +66,30 @@ public: } } + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + for (const auto& surface : GetSurfacesInRegion(addr, size)) { + if (surface->IsMemoryMarked()) { + UnmarkMemory(surface); + surface->SetSyncPending(true); + marked_for_unregister.emplace_back(surface); + } + } + } + + void SyncGuestHost() { + std::lock_guard lock{mutex}; + + for (const auto& surface : marked_for_unregister) { + if (surface->IsRegistered()) { + surface->SetSyncPending(false); + Unregister(surface); + } + } + marked_for_unregister.clear(); + } + /** * Guarantees that rendertargets don't unregister themselves if the * collide. Protection is currently only done on 3D slices. 
@@ -85,10 +113,20 @@ public: return a->GetModificationTick() < b->GetModificationTick(); }); for (const auto& surface : surfaces) { + mutex.unlock(); FlushSurface(surface); + mutex.lock(); } } + bool MustFlushRegion(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + const auto surfaces = GetSurfacesInRegion(addr, size); + return std::any_of(surfaces.cbegin(), surfaces.cend(), + [](const TSurface& surface) { return surface->IsModified(); }); + } + TView GetTextureSurface(const Tegra::Texture::TICEntry& tic, const VideoCommon::Shader::Sampler& entry) { std::lock_guard lock{mutex}; @@ -97,8 +135,7 @@ public: return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } @@ -108,7 +145,7 @@ public: } const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false); + const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); if (guard_samplers) { sampled_textures.push_back(surface); } @@ -122,13 +159,12 @@ public: if (!gpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false); + const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); if (guard_samplers) { sampled_textures.push_back(surface); } @@ -143,13 +179,13 @@ public: return any_rt; } - TView GetDepthBufferSurface() { + TView GetDepthBufferSurface(bool preserve_contents) { std::lock_guard lock{mutex}; - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { + auto& dirty = maxwell3d.dirty; + if (!dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { return depth_buffer.view; } - maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; + dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -157,14 +193,13 @@ public: SetEmptyDepthBuffer(); return {}; } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { SetEmptyDepthBuffer(); return {}; } - const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; - auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, true); + const auto depth_params{SurfaceParams::CreateForDepthBuffer(maxwell3d)}; + auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true); if (depth_buffer.target) depth_buffer.target->MarkAsRenderTarget(false, NO_RT); depth_buffer.target = surface_view.first; @@ -174,10 +209,9 @@ public: return surface_view.second; } - TView GetColorBufferSurface(std::size_t index) { + TView GetColorBufferSurface(std::size_t index, bool preserve_contents) { std::lock_guard lock{mutex}; ASSERT(index < 
Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - auto& maxwell3d = system.GPU().Maxwell3D(); if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { return render_targets[index].view; } @@ -197,17 +231,23 @@ public: return {}; } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { SetEmptyColorBuffer(index); return {}; } - auto surface_view = GetSurface(gpu_addr, *cpu_addr, - SurfaceParams::CreateForFramebuffer(system, index), true); - if (render_targets[index].target) - render_targets[index].target->MarkAsRenderTarget(false, NO_RT); + auto surface_view = + GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(maxwell3d, index), + preserve_contents, true); + if (render_targets[index].target) { + auto& surface = render_targets[index].target; + surface->MarkAsRenderTarget(false, NO_RT); + const auto& cr_params = surface->GetSurfaceParams(); + if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + AsyncFlushSurface(surface); + } + } render_targets[index].target = surface_view.first; render_targets[index].view = surface_view.second; if (render_targets[index].target) @@ -254,40 +294,69 @@ public: const GPUVAddr src_gpu_addr = src_config.Address(); const GPUVAddr dst_gpu_addr = dst_config.Address(); DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - const std::optional<VAddr> dst_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); - const std::optional<VAddr> src_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); - std::pair<TSurface, TView> dst_surface = - GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, false); - std::pair<TSurface, TView> src_surface = - GetSurface(src_gpu_addr, *src_cpu_addr, src_params, false); - ImageBlit(src_surface.second, dst_surface.second, copy_config); + + const std::optional<VAddr> dst_cpu_addr = gpu_memory.GpuToCpuAddress(dst_gpu_addr); + const std::optional<VAddr> src_cpu_addr = gpu_memory.GpuToCpuAddress(src_gpu_addr); + std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); + TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; + ImageBlit(src_surface, dst_surface.second, copy_config); dst_surface.first->MarkAsModified(true, Tick()); } - TSurface TryFindFramebufferSurface(VAddr addr) { + TSurface TryFindFramebufferSurface(VAddr addr) const { if (!addr) { return nullptr; } const VAddr page = addr >> registry_page_bits; - std::vector<TSurface>& list = registry[page]; - for (auto& surface : list) { - if (surface->GetCpuAddr() == addr) { - return surface; - } + const auto it = registry.find(page); + if (it == registry.end()) { + return nullptr; } - return nullptr; + const auto& list = it->second; + const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) { + return surface->GetCpuAddr() == addr; + }); + return found != list.end() ? 
*found : nullptr; } u64 Tick() { return ++ticks; } + void CommitAsyncFlushes() { + committed_flushes.push_back(uncommitted_flushes); + uncommitted_flushes.reset(); + } + + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + bool ShouldWaitAsyncFlushes() const { + return !committed_flushes.empty() && committed_flushes.front() != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (TSurface& surface : *flush_list) { + FlushSurface(surface); + } + committed_flushes.pop_front(); + } + protected: - explicit TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - bool is_astc_supported) - : system{system}, is_astc_supported{is_astc_supported}, rasterizer{rasterizer} { + explicit TextureCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, + bool is_astc_supported_) + : is_astc_supported{is_astc_supported_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_} { for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { SetEmptyColorBuffer(i); } @@ -300,9 +369,9 @@ protected: siblings_table[static_cast<std::size_t>(b)] = a; }; std::fill(siblings_table.begin(), siblings_table.end(), PixelFormat::Invalid); - make_siblings(PixelFormat::Z16, PixelFormat::R16U); - make_siblings(PixelFormat::Z32F, PixelFormat::R32F); - make_siblings(PixelFormat::Z32FS8, PixelFormat::RG32F); + make_siblings(PixelFormat::D16_UNORM, PixelFormat::R16_UNORM); + make_siblings(PixelFormat::D32_FLOAT, PixelFormat::R32_FLOAT); + make_siblings(PixelFormat::D32_FLOAT_S8_UINT, PixelFormat::R32G32_FLOAT); sampled_textures.reserve(64); } @@ -322,7 +391,7 @@ protected: virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; void ManageRenderTargetUnregister(TSurface& surface) { - auto& dirty = system.GPU().Maxwell3D().dirty; + auto& dirty = maxwell3d.dirty; const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; @@ -335,8 +404,7 @@ protected: void Register(TSurface surface) { const GPUVAddr gpu_addr = surface->GetGpuAddr(); const std::size_t size = surface->GetSizeInBytes(); - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { LOG_CRITICAL(HW_GPU, "Failed to register surface with unmapped gpu_address 0x{:016x}", gpu_addr); @@ -345,9 +413,20 @@ protected: surface->SetCpuAddr(*cpu_addr); RegisterInnerCache(surface); surface->MarkAsRegistered(true); + surface->SetMemoryMarked(true); rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1); } + void UnmarkMemory(TSurface surface) { + if (!surface->IsMemoryMarked()) { + return; + } + const std::size_t size = surface->GetSizeInBytes(); + const VAddr cpu_addr = surface->GetCpuAddr(); + rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); + surface->SetMemoryMarked(false); + } + void Unregister(TSurface surface) { if (guard_render_targets && surface->IsProtected()) { return; @@ -355,9 +434,11 @@ protected: if (!guard_render_targets && surface->IsRenderTarget()) { ManageRenderTargetUnregister(surface); } - const std::size_t size = surface->GetSizeInBytes(); - const VAddr cpu_addr = surface->GetCpuAddr(); - 
rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); + UnmarkMemory(surface); + if (surface->IsSyncPending()) { + marked_for_unregister.remove(surface); + surface->SetSyncPending(false); + } UnregisterInnerCache(surface); surface->MarkAsRegistered(false); ReserveSurface(surface->GetSurfaceParams(), surface); @@ -373,7 +454,6 @@ protected: return new_surface; } - Core::System& system; const bool is_astc_supported; private: @@ -415,18 +495,18 @@ private: * @param untopological Indicates to the recycler that the texture has no way * to match the overlaps due to topological reasons. **/ - RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params, + RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params, const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { return RecycleStrategy::Flush; } // 3D Textures decision - if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { + if (params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } for (const auto& s : overlaps) { const auto& s_params = s->GetSurfaceParams(); - if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { + if (s_params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } } @@ -450,18 +530,21 @@ private: * @param overlaps The overlapping surfaces registered in the cache. * @param params The parameters for the new surface. * @param gpu_addr The starting address of the new surface. + * @param preserve_contents Indicates that the new surface should be loaded from memory or left + * blank. * @param untopological Indicates to the recycler that the texture has no way to match the * overlaps due to topological reasons. 
**/ - std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps, - const SurfaceParams& params, const GPUVAddr gpu_addr, + std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params, + const GPUVAddr gpu_addr, const bool preserve_contents, const MatchTopologyResult untopological) { + const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); for (auto& surface : overlaps) { Unregister(surface); } switch (PickStrategy(overlaps, params, gpu_addr, untopological)) { case RecycleStrategy::Ignore: { - return InitializeSurface(gpu_addr, params, Settings::values.use_accurate_gpu_emulation); + return InitializeSurface(gpu_addr, params, do_load); } case RecycleStrategy::Flush: { std::sort(overlaps.begin(), overlaps.end(), @@ -471,7 +554,7 @@ private: for (auto& surface : overlaps) { FlushSurface(surface); } - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } case RecycleStrategy::BufferCopy: { auto new_surface = GetUncachedSurface(gpu_addr, params); @@ -480,7 +563,7 @@ private: } default: { UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!"); - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, do_load); } } } @@ -507,15 +590,15 @@ private: } else { new_surface = GetUncachedSurface(gpu_addr, params); } - const auto& final_params = new_surface->GetSurfaceParams(); + const SurfaceParams& final_params = new_surface->GetSurfaceParams(); if (cr_params.type != final_params.type) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { BufferCopy(current_surface, new_surface); } } else { std::vector<CopyParams> bricks = current_surface->BreakDown(final_params); for (auto& brick : bricks) { - ImageCopy(current_surface, new_surface, brick); + TryCopyImage(current_surface, new_surface, brick); } } Unregister(current_surface); @@ -563,47 +646,65 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. 
**/ - std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps, + std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr) { + GPUVAddr gpu_addr) { if (params.target == SurfaceTarget::Texture3D) { - return {}; + return std::nullopt; } - bool modified = false; + const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; TSurface new_surface = GetUncachedSurface(gpu_addr, params); - u32 passed_tests = 0; + + if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { + LoadSurface(new_surface); + for (const auto& surface : overlaps) { + Unregister(surface); + } + Register(new_surface); + return {{new_surface, new_surface->GetMainView()}}; + } + + std::size_t passed_tests = 0; for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.is_layered || src_params.num_levels > 1) { - // We send this cases to recycle as they are more complex to handle - return {}; - } - const std::size_t candidate_size = surface->GetSizeInBytes(); - auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; + const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; if (!mipmap_layer) { continue; } - const auto [layer, mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(mipmap) != candidate_size) { + const auto [base_layer, base_mipmap] = *mipmap_layer; + if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { continue; } - modified |= surface->IsModified(); - // Now we got all the data set up - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); - passed_tests++; - ImageCopy(surface, new_surface, copy_params); + ++passed_tests; + + // Copy all mipmaps and layers + const u32 block_width = params.GetDefaultBlockWidth(); + const u32 block_height = params.GetDefaultBlockHeight(); + for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { + const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); + const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); + if (width < block_width || height < block_height) { + // Current APIs forbid copying small compressed textures, avoid errors + break; + } + const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, + src_params.depth); + TryCopyImage(surface, new_surface, copy_params); + } } if (passed_tests == 0) { - return {}; + return std::nullopt; + } + if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { // In Accurate GPU all tests should pass, else we recycle - } else if (Settings::values.use_accurate_gpu_emulation && passed_tests != overlaps.size()) { - return {}; + return std::nullopt; } + + const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); for (const auto& surface : overlaps) { Unregister(surface); } + new_surface->MarkAsModified(modified, Tick()); Register(new_surface); return {{new_surface, new_surface->GetMainView()}}; @@ -614,64 +715,26 @@ private: * textures within the GPU if possible. Falls back to LLE when it isn't possible to use any of * the HLE methods. * - * @param overlaps The overlapping surfaces registered in the cache. 
- * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param cache_addr The starting address of the new surface on physical memory. + * @param overlaps The overlapping surfaces registered in the cache. + * @param params The parameters on the new surface. + * @param gpu_addr The starting address of the new surface. + * @param cpu_addr The starting address of the new surface on physical memory. + * @param preserve_contents Indicates that the new surface should be loaded from memory or + * left blank. */ - std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps, + std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, - const VAddr cpu_addr) { - if (params.target == SurfaceTarget::Texture3D) { - bool failed = false; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D) { - failed = true; - break; - } - if (src_params.height != params.height) { - failed = true; - break; - } - if (src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - failed = true; - break; - } - const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); - const auto [x, y, z] = params.GetBlockOffsetXYZ(offset); - modified |= surface->IsModified(); - const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, - 1); - ImageCopy(surface, new_surface, copy_params); - } - if (failed) { - return std::nullopt; - } - for (const auto& surface : overlaps) { - Unregister(surface); - } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - auto view = new_surface->GetMainView(); - return {{std::move(new_surface), view}}; - } else { + GPUVAddr gpu_addr, VAddr cpu_addr, + bool preserve_contents) { + if (params.target != SurfaceTarget::Texture3D) { for (const auto& surface : overlaps) { if (!surface->MatchTarget(params.target)) { if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { return std::nullopt; } Unregister(surface); - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } return std::nullopt; } @@ -679,11 +742,60 @@ private: continue; } if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return {{surface, surface->GetMainView()}}; + return std::make_pair(surface, surface->GetMainView()); + } + } + return InitializeSurface(gpu_addr, params, preserve_contents); + } + + if (params.num_levels > 1) { + // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach + return std::nullopt; + } + + if (overlaps.size() == 1) { + const auto& surface = overlaps[0]; + const SurfaceParams& overlap_params = surface->GetSurfaceParams(); + // Don't attempt to render to textures with more than one level for now + // The texture has to be to the right or the sample address if we want to render to it + if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { + const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); + 
const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + if (slice < overlap_params.depth) { + auto view = surface->Emplace3DView(slice, params.depth, 0, 1); + return std::make_pair(std::move(surface), std::move(view)); } } - return InitializeSurface(gpu_addr, params); } + + TSurface new_surface = GetUncachedSurface(gpu_addr, params); + bool modified = false; + + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + if (src_params.target != SurfaceTarget::Texture2D || + src_params.height != params.height || + src_params.block_depth != params.block_depth || + src_params.block_height != params.block_height) { + return std::nullopt; + } + modified |= surface->IsModified(); + + const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + const u32 width = params.width; + const u32 height = params.height; + const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); + TryCopyImage(surface, new_surface, copy_params); + } + for (const auto& surface : overlaps) { + Unregister(surface); + } + new_surface->MarkAsModified(modified, Tick()); + Register(new_surface); + + TView view = new_surface->GetMainView(); + return std::make_pair(std::move(new_surface), std::move(view)); } /** @@ -705,10 +817,13 @@ private: * * @param gpu_addr The starting address of the candidate surface. * @param params The parameters on the candidate surface. + * @param preserve_contents Indicates that the new surface should be loaded from memory or + * left blank. * @param is_render Whether or not the surface is a render target. **/ std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr, - const SurfaceParams& params, bool is_render) { + const SurfaceParams& params, bool preserve_contents, + bool is_render) { // Step 1 // Check Level 1 Cache for a fast structural match. If candidate surface // matches at certain level we are pretty much done. @@ -716,8 +831,9 @@ private: TSurface& current_surface = iter->second; const auto topological_result = current_surface->MatchesTopology(params); if (topological_result != MatchTopologyResult::FullMatch) { - std::vector<TSurface> overlaps{current_surface}; - return RecycleSurface(overlaps, params, gpu_addr, topological_result); + VectorSurface overlaps{current_surface}; + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + topological_result); } const auto struct_result = current_surface->MatchesStructure(params); @@ -742,7 +858,7 @@ private: // If none are found, we are done. we just load the surface and create it. 
if (overlaps.empty()) { - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } // Step 3 @@ -752,13 +868,15 @@ private: for (const auto& surface : overlaps) { const auto topological_result = surface->MatchesTopology(params); if (topological_result != MatchTopologyResult::FullMatch) { - return RecycleSurface(overlaps, params, gpu_addr, topological_result); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + topological_result); } } - // Check if it's a 3D texture + // Manage 3D textures if (params.block_depth > 0) { - auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr); + auto surface = + Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); if (surface) { return *surface; } @@ -771,14 +889,12 @@ private: // two things either the candidate surface is a supertexture of the overlap // or they don't match in any known way. if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - if (current_surface->GetGpuAddr() == gpu_addr) { - std::optional<std::pair<TSurface, TView>> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } + const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); + if (view) { + return *view; } - return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + MatchTopologyResult::FullMatch); } // Now we check if the candidate is a mipmap/layer of the overlap std::optional<TView> view = @@ -802,7 +918,7 @@ private: pair.first->EmplaceView(params, gpu_addr, candidate_size); if (mirage_view) return {pair.first, *mirage_view}; - return RecycleSurface(overlaps, params, gpu_addr, + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, MatchTopologyResult::FullMatch); } return {current_surface, *view}; @@ -818,7 +934,8 @@ private: } } // We failed all the tests, recycle the overlaps into a new texture. - return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + MatchTopologyResult::FullMatch); } /** @@ -831,8 +948,7 @@ private: * @param params The parameters on the candidate surface. 
**/ Deduction DeduceSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { Deduction result{}; @@ -892,7 +1008,9 @@ private: params.target = target; params.is_tiled = false; params.srgb_conversion = false; - params.is_layered = false; + params.is_layered = + target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray || + target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray; params.block_width = 0; params.block_height = 0; params.block_depth = 0; @@ -906,7 +1024,7 @@ private: params.pitch = 4; params.num_levels = 1; params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::R8U; + params.pixel_format = VideoCore::Surface::PixelFormat::R8_UNORM; params.type = VideoCore::Surface::SurfaceType::ColorTexture; auto surface = CreateSurface(0ULL, params); invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); @@ -929,7 +1047,7 @@ private: void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(src_gpu_addr, src_params); + auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); if (deduced_src.Failed() || deduced_dst.Failed()) { return; } @@ -976,10 +1094,10 @@ private: } std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params, - bool do_load = true) { + bool preserve_contents) { auto new_surface{GetUncachedSurface(gpu_addr, params)}; Register(new_surface); - if (do_load) { + if (preserve_contents) { LoadSurface(new_surface); } return {new_surface, new_surface->GetMainView()}; @@ -987,7 +1105,7 @@ private: void LoadSurface(const TSurface& surface) { staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->LoadBuffer(system.GPU().MemoryManager(), staging_cache); + surface->LoadBuffer(gpu_memory, staging_cache); surface->UploadTexture(staging_cache.GetBuffer(0)); surface->MarkAsModified(false, Tick()); } @@ -998,7 +1116,7 @@ private: } staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); surface->DownloadTexture(staging_cache.GetBuffer(0)); - surface->FlushBuffer(system.GPU().MemoryManager(), staging_cache); + surface->FlushBuffer(gpu_memory, staging_cache); surface->MarkAsModified(false, Tick()); } @@ -1025,23 +1143,25 @@ private: } } - std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { + VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { if (size == 0) { return {}; } const VAddr cpu_addr_end = cpu_addr + size; - VAddr start = cpu_addr >> registry_page_bits; const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; - std::vector<TSurface> surfaces; - while (start <= end) { - std::vector<TSurface>& list = registry[start]; - for (auto& surface : list) { - if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) { - surface->MarkAsPicked(true); - surfaces.push_back(surface); + VectorSurface surfaces; + for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) { + const auto it = registry.find(start); + if (it == registry.end()) { + continue; + } + for (auto& surface : it->second) { + if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) { + continue; } + 
surface->MarkAsPicked(true); + surfaces.push_back(surface); } - start++; } for (auto& surface : surfaces) { surface->MarkAsPicked(false); @@ -1066,6 +1186,19 @@ private: return {}; } + /// Try to do an image copy logging when formats are incompatible. + void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) { + const SurfaceParams& src_params = src->GetSurfaceParams(); + const SurfaceParams& dst_params = dst->GetSurfaceParams(); + if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) { + LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", + static_cast<int>(dst_params.pixel_format), + static_cast<int>(src_params.pixel_format)); + return; + } + ImageCopy(src, dst, copy); + } + constexpr PixelFormat GetSiblingFormat(PixelFormat format) const { return siblings_table[static_cast<std::size_t>(format)]; } @@ -1073,7 +1206,7 @@ private: /// Returns true the shader sampler entry is compatible with the TIC texture type. static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, const VideoCommon::Shader::Sampler& entry) { - const auto shader_type = entry.GetType(); + const auto shader_type = entry.type; switch (tic_type) { case Tegra::Texture::TextureType::Texture1D: case Tegra::Texture::TextureType::Texture1DArray: @@ -1094,7 +1227,7 @@ private: if (shader_type == Tegra::Shader::TextureType::TextureCube) { return true; } - return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray(); + return shader_type == Tegra::Shader::TextureType::Texture2D && entry.is_array; } UNREACHABLE(); return true; @@ -1105,9 +1238,19 @@ private: TView view; }; + void AsyncFlushSurface(TSurface& surface) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::list<TSurface>>(); + } + uncommitted_flushes->push_back(surface); + } + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; FormatLookupTable format_lookup_table; + FormatCompatibility format_compatibility; u64 ticks{}; @@ -1149,6 +1292,11 @@ private: std::unordered_map<u32, TSurface> invalid_cache; std::vector<u8> invalid_memory; + std::list<TSurface> marked_for_unregister; + + std::shared_ptr<std::list<TSurface>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::list<TSurface>>> committed_flushes; + StagingCache staging_cache; std::recursive_mutex mutex; }; diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp index f3efa7eb0..962921483 100644 --- a/src/video_core/textures/convert.cpp +++ b/src/video_core/textures/convert.cpp @@ -35,7 +35,7 @@ void SwapS8Z24ToZ24S8(u8* data, u32 width, u32 height) { S8Z24 s8z24_pixel{}; Z24S8 z24s8_pixel{}; constexpr auto bpp{ - VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8Z24)}; + VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM)}; for (std::size_t y = 0; y < height; ++y) { for (std::size_t x = 0; x < width; ++x) { const std::size_t offset{bpp * (y * width + x)}; @@ -73,7 +73,7 @@ void ConvertFromGuestToHost(u8* in_data, u8* out_data, PixelFormat pixel_format, in_data, width, height, depth, block_width, block_height); std::copy(rgba8_data.begin(), rgba8_data.end(), out_data); - } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) { + } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { Tegra::Texture::ConvertS8Z24ToZ24S8(in_data, width, height); } } @@ -85,7 +85,7 @@ void ConvertFromHostToGuest(u8* data, PixelFormat 
pixel_format, u32 width, u32 h static_cast<u32>(pixel_format)); UNREACHABLE(); - } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) { + } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { Tegra::Texture::ConvertZ24S8ToS8Z24(data, width, height); } } diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 7df5f1452..16d46a018 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -6,11 +6,13 @@ #include <cstring> #include "common/alignment.h" #include "common/assert.h" +#include "common/bit_util.h" #include "video_core/gpu.h" #include "video_core/textures/decoders.h" #include "video_core/textures/texture.h" namespace Tegra::Texture { +namespace { /** * This table represents the internal swizzle of a gob, @@ -36,20 +38,10 @@ struct alignas(64) SwizzleTable { std::array<std::array<u16, M>, N> values{}; }; -constexpr u32 gob_size_x_shift = 6; -constexpr u32 gob_size_y_shift = 3; -constexpr u32 gob_size_z_shift = 0; -constexpr u32 gob_size_shift = gob_size_x_shift + gob_size_y_shift + gob_size_z_shift; +constexpr u32 FAST_SWIZZLE_ALIGN = 16; -constexpr u32 gob_size_x = 1U << gob_size_x_shift; -constexpr u32 gob_size_y = 1U << gob_size_y_shift; -constexpr u32 gob_size_z = 1U << gob_size_z_shift; -constexpr u32 gob_size = 1U << gob_size_shift; - -constexpr u32 fast_swizzle_align = 16; - -constexpr auto legacy_swizzle_table = SwizzleTable<gob_size_y, gob_size_x, gob_size_z>(); -constexpr auto fast_swizzle_table = SwizzleTable<gob_size_y, 4, fast_swizzle_align>(); +constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>(); +constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>(); /** * This function manages ALL the GOBs(Group of Bytes) Inside a single block. @@ -68,17 +60,17 @@ void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, con u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; for (u32 y = y_start; y < y_end; y++) { - const auto& table = legacy_swizzle_table[y % gob_size_y]; + const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (u32 x = x_start; x < x_end; x++) { - const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % gob_size_x]}; + const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]}; const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; data_ptrs[unswizzle] = swizzled_data + swizzle_offset; data_ptrs[!unswizzle] = unswizzled_data + pixel_index; std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); } pixel_base += stride_x; - if ((y + 1) % gob_size_y == 0) - y_address += gob_size; + if ((y + 1) % GOB_SIZE_Y == 0) + y_address += GOB_SIZE; } z_address += xy_block_size; } @@ -103,18 +95,18 @@ void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; for (u32 y = y_start; y < y_end; y++) { - const auto& table = fast_swizzle_table[y % gob_size_y]; - for (u32 xb = x_startb; xb < x_endb; xb += fast_swizzle_align) { - const u32 swizzle_offset{y_address + table[(xb / fast_swizzle_align) % 4]}; + const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y]; + for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) { + const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]}; const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; const u32 pixel_index{out_x + pixel_base}; data_ptrs[unswizzle ? 
1 : 0] = swizzled_data + swizzle_offset; data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], fast_swizzle_align); + std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN); } pixel_base += stride_x; - if ((y + 1) % gob_size_y == 0) - y_address += gob_size; + if ((y + 1) % GOB_SIZE_Y == 0) + y_address += GOB_SIZE; } z_address += xy_block_size; } @@ -137,9 +129,9 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; const u32 stride_x = width * out_bytes_per_pixel; const u32 layer_z = height * stride_x; - const u32 gob_elements_x = gob_size_x / bytes_per_pixel; - constexpr u32 gob_elements_y = gob_size_y; - constexpr u32 gob_elements_z = gob_size_z; + const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel; + constexpr u32 gob_elements_y = GOB_SIZE_Y; + constexpr u32 gob_elements_z = GOB_SIZE_Z; const u32 block_x_elements = gob_elements_x; const u32 block_y_elements = gob_elements_y * block_height; const u32 block_z_elements = gob_elements_z * block_depth; @@ -147,7 +139,7 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements); const u32 blocks_on_y = div_ceil(height, block_y_elements); const u32 blocks_on_z = div_ceil(depth, block_z_elements); - const u32 xy_block_size = gob_size * block_height; + const u32 xy_block_size = GOB_SIZE * block_height; const u32 block_size = xy_block_size * block_depth; u32 tile_offset = 0; for (u32 zb = 0; zb < blocks_on_z; zb++) { @@ -174,12 +166,14 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool } } +} // Anonymous namespace + void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data, bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) { const u32 block_height_size{1U << block_height}; const u32 block_depth_size{1U << block_depth}; - if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % fast_swizzle_align == 0) { + if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) { SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, bytes_per_pixel, out_bytes_per_pixel, block_height_size, block_depth_size, width_spacing); @@ -190,53 +184,6 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, } } -u32 BytesPerPixel(TextureFormat format) { - switch (format) { - case TextureFormat::DXT1: - case TextureFormat::DXN1: - // In this case a 'pixel' actually refers to a 4x4 tile. - return 8; - case TextureFormat::DXT23: - case TextureFormat::DXT45: - case TextureFormat::DXN2: - case TextureFormat::BC7U: - case TextureFormat::BC6H_UF16: - case TextureFormat::BC6H_SF16: - // In this case a 'pixel' actually refers to a 4x4 tile. 
- return 16; - case TextureFormat::R32_G32_B32: - return 12; - case TextureFormat::ASTC_2D_4X4: - case TextureFormat::ASTC_2D_5X4: - case TextureFormat::ASTC_2D_8X8: - case TextureFormat::ASTC_2D_8X5: - case TextureFormat::ASTC_2D_10X8: - case TextureFormat::ASTC_2D_5X5: - case TextureFormat::A8R8G8B8: - case TextureFormat::A2B10G10R10: - case TextureFormat::BF10GF11RF11: - case TextureFormat::R32: - case TextureFormat::R16_G16: - return 4; - case TextureFormat::A1B5G5R5: - case TextureFormat::B5G6R5: - case TextureFormat::G8R8: - case TextureFormat::R16: - return 2; - case TextureFormat::R8: - return 1; - case TextureFormat::R16_G16_B16_A16: - return 8; - case TextureFormat::R32_G32_B32_A32: - return 16; - case TextureFormat::R32_G32: - return 8; - default: - UNIMPLEMENTED_MSG("Format not implemented"); - return 1; - } -} - void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, u32 width_spacing) { @@ -256,47 +203,82 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, } void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, + u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit, u32 offset_x, u32 offset_y) { const u32 block_height = 1U << block_height_bit; - const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / - gob_size_x}; + const u32 image_width_in_gobs = + (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X; for (u32 line = 0; line < subrect_height; ++line) { const u32 dst_y = line + offset_y; const u32 gob_address_y = - (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + - ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[dst_y % gob_size_y]; + (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + + ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; + const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; for (u32 x = 0; x < subrect_width; ++x) { const u32 dst_x = x + offset_x; const u32 gob_address = - gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height; - const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x]; - u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; - u8* dest_addr = swizzled_data + swizzled_offset; + gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height; + const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X]; + const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel; + const u8* const source_line = unswizzled_data + unswizzled_offset; + u8* const dest_addr = swizzled_data + swizzled_offset; std::memcpy(dest_addr, source_line, bytes_per_pixel); } } } -void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - u32 block_height_bit, u32 offset_x, u32 offset_y) { - const u32 block_height = 1U << block_height_bit; - for (u32 line = 0; line < subrect_height; ++line) { - const u32 y2 = line + offset_y; - const u32 gob_address_y = (y2 / (gob_size_y * block_height)) * gob_size * block_height + - ((y2 % (gob_size_y * 
block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[y2 % gob_size_y]; - for (u32 x = 0; x < subrect_width; ++x) { - const u32 x2 = (x + offset_x) * bytes_per_pixel; - const u32 gob_address = gob_address_y + (x2 / gob_size_x) * gob_size * block_height; - const u32 swizzled_offset = gob_address + table[x2 % gob_size_x]; - u8* dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel; - u8* source_addr = swizzled_data + swizzled_offset; +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, + u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { + const u32 stride = width * bytes_per_pixel; + const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); + + const u32 block_height_mask = (1U << block_height) - 1; + const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height; + + for (u32 line = 0; line < line_count; ++line) { + const u32 src_y = line + origin_y; + const auto& table = LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; + + const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT; + const u32 src_offset_y = (block_y >> block_height) * block_size + + ((block_y & block_height_mask) << GOB_SIZE_SHIFT); + for (u32 column = 0; column < line_length_in; ++column) { + const u32 src_x = (column + origin_x) * bytes_per_pixel; + const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift; + + const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X]; + const u32 unswizzled_offset = line * pitch + column * bytes_per_pixel; + + std::memcpy(output + unswizzled_offset, input + swizzled_offset, bytes_per_pixel); + } + } +} + +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, + u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, + u32 origin_y, u8* output, const u8* input) { + UNIMPLEMENTED_IF(origin_x > 0); + UNIMPLEMENTED_IF(origin_y > 0); - std::memcpy(dest_line, source_addr, bytes_per_pixel); + const u32 stride = width * bytes_per_pixel; + const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); + + const u32 block_height_mask = (1U << block_height) - 1; + const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth; + + for (u32 line = 0; line < line_count; ++line) { + const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y]; + const u32 block_y = line / GOB_SIZE_Y; + const u32 dst_offset_y = + (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; + for (u32 x = 0; x < line_length_in; ++x) { + const u32 dst_offset = + ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X]; + const u32 src_offset = x * bytes_per_pixel + line * pitch; + std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel); } } } @@ -305,17 +287,17 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 const u32 block_height_bit, const std::size_t copy_size, const u8* source_data, u8* swizzle_data) { const u32 block_height = 1U << block_height_bit; - const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X}; std::size_t count = 0; for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { const std::size_t gob_address_y = - (y / (gob_size_y * block_height)) * gob_size * 
block_height * image_width_in_gobs + - ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[y % gob_size_y]; + (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + + ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; + const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { const std::size_t gob_address = - gob_address_y + (x / gob_size_x) * gob_size * block_height; - const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; + gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; + const std::size_t swizzled_offset = gob_address + table[x % GOB_SIZE_X]; const u8* source_line = source_data + count; u8* dest_addr = swizzle_data + swizzled_offset; count++; @@ -325,58 +307,30 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 } } -std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, - u32 height) { - std::vector<u8> rgba_data; - - // TODO(Subv): Implement. - switch (format) { - case TextureFormat::DXT1: - case TextureFormat::DXT23: - case TextureFormat::DXT45: - case TextureFormat::DXN1: - case TextureFormat::DXN2: - case TextureFormat::BC7U: - case TextureFormat::BC6H_UF16: - case TextureFormat::BC6H_SF16: - case TextureFormat::ASTC_2D_4X4: - case TextureFormat::ASTC_2D_8X8: - case TextureFormat::ASTC_2D_5X5: - case TextureFormat::ASTC_2D_10X8: - case TextureFormat::A8R8G8B8: - case TextureFormat::A2B10G10R10: - case TextureFormat::A1B5G5R5: - case TextureFormat::B5G6R5: - case TextureFormat::R8: - case TextureFormat::G8R8: - case TextureFormat::BF10GF11RF11: - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R32_G32: - case TextureFormat::R32: - case TextureFormat::R16: - case TextureFormat::R16_G16: - case TextureFormat::R32_G32_B32: - // TODO(Subv): For the time being just forward the same data without any decoding. 
- rgba_data = texture_data; - break; - default: - UNIMPLEMENTED_MSG("Format not implemented"); - break; - } - - return rgba_data; -} - std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth) { if (tiled) { - const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, gob_size_x_shift); - const u32 aligned_height = Common::AlignBits(height, gob_size_y_shift + block_height); - const u32 aligned_depth = Common::AlignBits(depth, gob_size_z_shift + block_depth); + const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, GOB_SIZE_X_SHIFT); + const u32 aligned_height = Common::AlignBits(height, GOB_SIZE_Y_SHIFT + block_height); + const u32 aligned_depth = Common::AlignBits(depth, GOB_SIZE_Z_SHIFT + block_depth); return aligned_width * aligned_height * aligned_depth; } else { return width * height * depth * bytes_per_pixel; } } +u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + u32 bytes_per_pixel) { + auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; + const u32 gobs_in_block = 1 << block_height; + const u32 y_blocks = GOB_SIZE_Y << block_height; + const u32 x_per_gob = GOB_SIZE_X / bytes_per_pixel; + const u32 x_blocks = div_ceil(width, x_per_gob); + const u32 block_size = GOB_SIZE * gobs_in_block; + const u32 stride = block_size * x_blocks; + const u32 base = (dst_y / y_blocks) * stride + (dst_x / x_per_gob) * block_size; + const u32 relative_y = dst_y % y_blocks; + return base + (relative_y / GOB_SIZE_Y) * GOB_SIZE; +} + } // namespace Tegra::Texture diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e5eac3f3b..01e156bc8 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -10,15 +10,15 @@ namespace Tegra::Texture { -// GOBSize constant. Calculated by 64 bytes in x multiplied by 8 y coords, represents -// an small rect of (64/bytes_per_pixel)X8. -inline std::size_t GetGOBSize() { - return 512; -} +constexpr u32 GOB_SIZE_X = 64; +constexpr u32 GOB_SIZE_Y = 8; +constexpr u32 GOB_SIZE_Z = 1; +constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; -inline std::size_t GetGOBSizeShift() { - return 9; -} +constexpr std::size_t GOB_SIZE_X_SHIFT = 6; +constexpr std::size_t GOB_SIZE_Y_SHIFT = 3; +constexpr std::size_t GOB_SIZE_Z_SHIFT = 0; +constexpr std::size_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; /// Unswizzles a swizzled texture without changing its format. void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, @@ -38,26 +38,42 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing); -/// Decodes an unswizzled texture into a A8R8G8B8 texture. -std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, - u32 height); - /// This function calculates the correct size of a texture depending if it's tiled or not. std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth); /// Copies an untiled subrectangle into a tiled surface. 
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, - u32 offset_x, u32 offset_y); + u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, + u32 block_height_bit, u32 offset_x, u32 offset_y); /// Copies a tiled subrectangle into a linear surface. -void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, - u32 offset_x, u32 offset_y); +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, + u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input); + +/// @brief Swizzles a 2D array of pixels into a 3D texture +/// @param line_length_in Number of pixels per line +/// @param line_count Number of lines +/// @param pitch Number of bytes per line +/// @param width Width of the swizzled texture +/// @param height Height of the swizzled texture +/// @param bytes_per_pixel Number of bytes used per pixel +/// @param block_height Block height shift +/// @param block_depth Block depth shift +/// @param origin_x Column offset in pixels of the swizzled texture +/// @param origin_y Row offset in pixels of the swizzled texture +/// @param output Pointer to the pixels of the swizzled texture +/// @param input Pointer to the 2D array of pixels used as input +/// @pre input and output points to an array large enough to hold the number of bytes used +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, + u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, + u32 origin_y, u8* output, const u8* input); + +void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + std::size_t copy_size, const u8* source_data, u8* swizzle_data); -void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, - const u32 block_height, const std::size_t copy_size, const u8* source_data, - u8* swizzle_data); +/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y' +u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + u32 bytes_per_pixel); } // namespace Tegra::Texture diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index d1939d744..4171e3ef2 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -48,7 +48,7 @@ constexpr std::array<float, 256> SRGB_CONVERSION_LUT = { }; unsigned SettingsMinimumAnisotropy() noexcept { - switch (static_cast<Anisotropy>(Settings::values.max_anisotropy)) { + switch (static_cast<Anisotropy>(Settings::values.max_anisotropy.GetValue())) { default: case Anisotropy::Default: return 1U; diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index eba05aced..0574fef12 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -12,10 +12,10 @@ namespace Tegra::Texture { enum class TextureFormat : u32 { - R32_G32_B32_A32 = 0x01, - R32_G32_B32 = 0x02, - R16_G16_B16_A16 = 0x03, - R32_G32 = 0x04, + R32G32B32A32 = 0x01, + R32G32B32 = 0x02, + R16G16B16A16 = 0x03, + R32G32 = 0x04, R32_B24G8 = 0x05, ETC2_RGB = 0x06, X8B8G8R8 = 0x07, @@ -23,19 +23,19 @@ enum class TextureFormat : u32 { A2B10G10R10 = 0x09, ETC2_RGB_PTA = 0x0a, ETC2_RGBA = 0x0b, - R16_G16 = 0x0c, - G8R24 = 0x0d, - G24R8 = 0x0e, + R16G16 = 0x0c, + R24G8 = 
0x0d, + R8G24 = 0x0e, R32 = 0x0f, - BC6H_SF16 = 0x10, - BC6H_UF16 = 0x11, + BC6H_SFLOAT = 0x10, + BC6H_UFLOAT = 0x11, A4B4G4R4 = 0x12, A5B5G5R1 = 0x13, A1B5G5R5 = 0x14, B5G6R5 = 0x15, B6G5R5 = 0x16, - BC7U = 0x17, - G8R8 = 0x18, + BC7 = 0x17, + R8G8 = 0x18, EAC = 0x19, EACX2 = 0x1a, R16 = 0x1b, @@ -43,23 +43,23 @@ enum class TextureFormat : u32 { R8 = 0x1d, G4R4 = 0x1e, R1 = 0x1f, - E5B9G9R9_SHAREDEXP = 0x20, - BF10GF11RF11 = 0x21, + E5B9G9R9 = 0x20, + B10G11R11 = 0x21, G8B8G8R8 = 0x22, B8G8R8G8 = 0x23, - DXT1 = 0x24, - DXT23 = 0x25, - DXT45 = 0x26, - DXN1 = 0x27, - DXN2 = 0x28, - S8Z24 = 0x29, + BC1_RGBA = 0x24, + BC2 = 0x25, + BC3 = 0x26, + BC4 = 0x27, + BC5 = 0x28, + S8D24 = 0x29, X8Z24 = 0x2a, - Z24S8 = 0x2b, + D24S8 = 0x2b, X4V4Z24__COV4R4V = 0x2c, X4V4Z24__COV8R8V = 0x2d, V8Z24__COV4R12V = 0x2e, - ZF32 = 0x2f, - ZF32_X24S8 = 0x30, + D32 = 0x2f, + D32S8 = 0x30, X8Z24_X20V4S8__COV4R4V = 0x31, X8Z24_X20V4S8__COV8R8V = 0x32, ZF32_X20V4X8__COV4R4V = 0x33, @@ -69,7 +69,7 @@ enum class TextureFormat : u32 { X8Z24_X16V8S8__COV4R12V = 0x37, ZF32_X16V8X8__COV4R12V = 0x38, ZF32_X16V8S8__COV4R12V = 0x39, - Z16 = 0x3a, + D16 = 0x3a, V8Z24__COV8R24V = 0x3b, X8Z24_X16V8S8__COV8R24V = 0x3c, ZF32_X16V8X8__COV8R24V = 0x3d, @@ -375,7 +375,4 @@ struct FullTextureInfo { TSCEntry tsc; }; -/// Returns the number of bytes per pixel of the input texture format. -u32 BytesPerPixel(TextureFormat format); - } // namespace Tegra::Texture diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index f60bdc60a..dd5cee4a1 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <memory> + #include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" @@ -16,43 +17,56 @@ #include "video_core/video_core.h" namespace { -std::unique_ptr<VideoCore::RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, - Core::System& system, - Core::Frontend::GraphicsContext& context) { - switch (Settings::values.renderer_backend) { + +std::unique_ptr<VideoCore::RendererBase> CreateRenderer( + Core::System& system, Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context) { + auto& telemetry_session = system.TelemetrySession(); + auto& cpu_memory = system.Memory(); + + switch (Settings::values.renderer_backend.GetValue()) { case Settings::RendererBackend::OpenGL: - return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system, context); + return std::make_unique<OpenGL::RendererOpenGL>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); #ifdef HAS_VULKAN case Settings::RendererBackend::Vulkan: - return std::make_unique<Vulkan::RendererVulkan>(emu_window, system); + return std::make_unique<Vulkan::RendererVulkan>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); #endif default: return nullptr; } } + } // Anonymous namespace namespace VideoCore { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { + std::unique_ptr<Tegra::GPU> gpu; + const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); + if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec); + } else { + gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec); + } + auto context = emu_window.CreateSharedContext(); const auto scope = context->Acquire(); - auto renderer = 
CreateRenderer(emu_window, system, *context); + + auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); if (!renderer->Init()) { return nullptr; } - if (Settings::values.use_asynchronous_gpu_emulation) { - return std::make_unique<VideoCommon::GPUAsynch>(system, std::move(renderer), - std::move(context)); - } - return std::make_unique<VideoCommon::GPUSynch>(system, std::move(renderer), std::move(context)); + gpu->BindRenderer(std::move(renderer)); + return gpu; } u16 GetResolutionScaleFactor(const RendererBase& renderer) { return static_cast<u16>( - Settings::values.resolution_factor != 0 - ? Settings::values.resolution_factor + Settings::values.resolution_factor.GetValue() != 0 + ? Settings::values.resolution_factor.GetValue() : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio()); } |
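Note on the swizzling changes above: decoders.h replaces the old GetGOBSize()/GetGOBSizeShift() helpers with GOB_SIZE_* constants, and decoders.cpp gains GetGOBOffset(), which locates the 512-byte GOB holding a given texel. The standalone C++ sketch below mirrors that arithmetic; the sample values in main() are illustrative only, and the patch's unused height parameter is omitted here.

#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

constexpr u32 GOB_SIZE_X = 64;                     // bytes per GOB row
constexpr u32 GOB_SIZE_Y = 8;                      // rows per GOB
constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y;  // 512 bytes per GOB

// Offset of the GOB holding texel (dst_x, dst_y); block_height is a shift,
// matching the formula added in decoders.cpp.
u64 GetGOBOffset(u32 width, u32 dst_x, u32 dst_y, u32 block_height, u32 bytes_per_pixel) {
    const auto div_ceil = [](u32 x, u32 y) { return (x + y - 1) / y; };
    const u32 gobs_in_block = 1U << block_height;       // GOBs stacked per block
    const u32 y_blocks = GOB_SIZE_Y << block_height;    // texel rows covered by one block
    const u32 x_per_gob = GOB_SIZE_X / bytes_per_pixel; // texels per GOB row
    const u32 x_blocks = div_ceil(width, x_per_gob);    // blocks per row of the image
    const u32 block_size = GOB_SIZE * gobs_in_block;    // bytes per block
    const u32 stride = block_size * x_blocks;           // bytes per row of blocks
    const u32 base = (dst_y / y_blocks) * stride + (dst_x / x_per_gob) * block_size;
    const u32 relative_y = dst_y % y_blocks;
    return base + (relative_y / GOB_SIZE_Y) * GOB_SIZE;
}

int main() {
    // Hypothetical 512-texel-wide RGBA8 surface with block_height shift 4 (16 GOBs per block).
    const u64 offset = GetGOBOffset(512, 100, 200, 4, 4);
    std::printf("GOB offset: %llu\n", static_cast<unsigned long long>(offset));
}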

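The texture cache also gains two-stage bookkeeping for asynchronous flushes: AsyncFlushSurface() collects surfaces into an uncommitted batch, CommitAsyncFlushes() seals that batch onto a queue, and PopAsyncFlushes() flushes the oldest sealed batch. Below is a minimal sketch of the same pattern, with a stand-in Surface type and printf in place of FlushSurface(); the names outside the patch (AsyncFlushQueue, QueueFlush, Commit, Pop) are hypothetical.

#include <cstdio>
#include <list>
#include <memory>

struct Surface {
    int id; // stand-in for TSurface
};

class AsyncFlushQueue {
public:
    // Mirrors AsyncFlushSurface(): lazily create the uncommitted batch and append.
    void QueueFlush(Surface surface) {
        if (!uncommitted) {
            uncommitted = std::make_shared<std::list<Surface>>();
        }
        uncommitted->push_back(surface);
    }

    // Mirrors CommitAsyncFlushes(): seal the current batch (possibly null).
    void Commit() {
        committed.push_back(uncommitted);
        uncommitted.reset();
    }

    // Mirrors ShouldWaitAsyncFlushes(): only non-null front batches need work.
    bool ShouldWait() const {
        return !committed.empty() && committed.front() != nullptr;
    }

    // Mirrors PopAsyncFlushes(): flush every surface in the oldest sealed batch.
    void Pop() {
        if (committed.empty()) {
            return;
        }
        const auto batch = committed.front();
        if (batch) {
            for (const Surface& surface : *batch) {
                std::printf("flush surface %d\n", surface.id); // stand-in for FlushSurface()
            }
        }
        committed.pop_front();
    }

private:
    std::shared_ptr<std::list<Surface>> uncommitted;
    std::list<std::shared_ptr<std::list<Surface>>> committed;
};

int main() {
    AsyncFlushQueue queue;
    queue.QueueFlush({1});
    queue.QueueFlush({2});
    queue.Commit();
    if (queue.ShouldWait()) {
        queue.Pop();
    }
}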