82 files changed, 4462 insertions, 1440 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2bf8d68ce..21c46a567 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     buffer_cache/buffer_cache.h
     buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
+    compatible_formats.cpp
+    compatible_formats.h
     dirty_flags.cpp
     dirty_flags.h
     dma_pusher.cpp
@@ -27,6 +29,8 @@ add_library(video_core STATIC
     engines/shader_type.h
     macro/macro.cpp
     macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
     macro/macro_interpreter.cpp
     macro/macro_interpreter.h
     macro/macro_jit_x64.cpp
@@ -49,11 +53,11 @@ add_library(video_core STATIC
     query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
-    rasterizer_cache.cpp
-    rasterizer_cache.h
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_device.cpp
@@ -93,6 +97,7 @@ add_library(video_core STATIC
     renderer_opengl/utils.h
     sampler_cache.cpp
     sampler_cache.h
+    shader_cache.h
     shader/decode/arithmetic.cpp
     shader/decode/arithmetic_immediate.cpp
     shader/decode/bfe.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
index e35ee0b67..e64170e66 100644
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {
 
 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
         return (cpu_addr < end) && (cpu_addr_end > start);
     }
 
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
         return cpu_addr <= other_start && other_end <= cpu_addr_end;
     }
 
-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
         return static_cast<std::size_t>(in_addr - cpu_addr);
     }
 
-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
         return cpu_addr;
     }
 
-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
         return cpu_addr_end;
     }
 
-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
         cpu_addr = new_addr;
         cpu_addr_end = new_addr + size;
     }
 
-    std::size_t GetSize() const {
+    std::size_t Size() const {
         return size;
     }
 
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
+    u64 Epoch() const {
+        return epoch;
     }
 
-    u64 GetEpoch() {
-        return epoch;
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
     }
 
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
     }
-    ~BufferBlock() = default;
 
 private:
     VAddr cpu_addr{};
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index b88fce2cd..cf8bdd021 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -30,23 +30,31 @@
 
 namespace VideoCommon {
 
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
     using IntervalSet = boost::icl::interval_set<VAddr>;
     using IntervalType = typename IntervalSet::interval_type;
     using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
 
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
 public:
-    using BufferInfo = std::pair<BufferType, u64>;
+    struct BufferInfo {
+        BufferType handle;
+        u64 offset;
+        u64 address;
+    };
 
     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                             bool is_written = false, bool use_fast_cbuf = false) {
         std::lock_guard lock{mutex};
 
-        const auto& memory_manager = system.GPU().MemoryManager();
+        auto& memory_manager = system.GPU().MemoryManager();
         const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         const VAddr cpu_addr = *cpu_addr_opt;
 
@@ -55,7 +63,6 @@ public:
         constexpr std::size_t max_stream_size = 0x800;
         if (use_fast_cbuf || size < max_stream_size) {
             if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
-                auto& memory_manager = system.GPU().MemoryManager();
                 const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                 if (use_fast_cbuf) {
                     u8* dest;
@@ -82,10 +89,10 @@ public:
             }
         }
 
-        OwnerBuffer block = GetBlock(cpu_addr, size);
+        Buffer* const block = GetBlock(cpu_addr, size);
         MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
         if (!map) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         if (is_written) {
             map->MarkAsModified(true, GetModifiedTicks());
@@ -98,7 +105,7 @@ public:
             }
         }
 
-        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
     }
 
     /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
@@ -110,31 +117,37 @@ public:
         });
     }
 
-    void Map(std::size_t max_size) {
+    /// Prepares the buffer cache for data uploading
+    /// @param max_size Maximum number of bytes that will be uploaded
+    /// @return True when a stream buffer invalidation was required, false otherwise
+    bool Map(std::size_t max_size) {
         std::lock_guard lock{mutex};
 
+        bool invalidated;
         std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
         buffer_offset = buffer_offset_base;
+
+        return invalidated;
     }
 
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
+    /// Finishes the upload stream
+    void Unmap() {
         std::lock_guard lock{mutex};
-
         stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
     }
 
+    /// Function called at the end of each frame, intended for deferred operations
     void TickFrame() {
         ++epoch;
+
         while (!pending_destruction.empty()) {
             // Delay at least 4 frames before destruction.
             // This is due to triple buffering happening on some drivers.
             static constexpr u64 epochs_to_destroy = 5;
-            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
                 break;
             }
-            pending_destruction.pop_front();
+            pending_destruction.pop();
         }
     }
 
@@ -245,28 +258,16 @@ public:
         committed_flushes.pop_front();
     }
 
-    virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
 
 protected:
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                          std::unique_ptr<StreamBuffer> stream_buffer)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
-          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}
 
     ~BufferCache() = default;
 
-    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
-
-    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
-    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                 const u8* data) = 0;
-
-    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                   u8* data) = 0;
-
-    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
-                           std::size_t dst_offset, std::size_t size) = 0;
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
 
     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
         return {};
@@ -321,7 +322,7 @@ protected:
     }
 
 private:
-    MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
+    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
                             std::size_t size) {
         const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
         if (overlaps.empty()) {
@@ -329,11 +330,11 @@ private:
             const VAddr cpu_addr_end = cpu_addr + size;
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
             } else {
                 staging_buffer.resize(size);
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
             }
             return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
@@ -376,7 +377,7 @@ private:
         return map;
     }
 
-    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
+    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
                      const VectorMapInterval& overlaps) {
         const IntervalType base_interval{start, end};
         IntervalSet interval_set{};
@@ -386,13 +387,13 @@ private:
             interval_set.subtract(subtract);
         }
         for (auto& interval : interval_set) {
-            std::size_t size = interval.upper() - interval.lower();
-            if (size > 0) {
-                staging_buffer.resize(size);
-                system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(interval.lower()), size,
-                                staging_buffer.data());
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
             }
+            staging_buffer.resize(size);
+            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
         }
     }
 
@@ -422,10 +423,14 @@ private:
     }
 
     void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
         const std::size_t size = map->end - map->start;
-        OwnerBuffer block = blocks[map->start >> block_page_bits];
         staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
         system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
@@ -438,7 +443,7 @@ private:
 
         buffer_ptr += size;
         buffer_offset += size;
-        return {stream_buffer_handle, uploaded_offset};
+        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
     }
 
     void AlignBuffer(std::size_t alignment) {
@@ -448,97 +453,89 @@ private:
         buffer_offset = offset_aligned;
     }
 
-    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
-        const std::size_t old_size = buffer->GetSize();
-        const std::size_t new_size = old_size + block_page_size;
-        const VAddr cpu_addr = buffer->GetCpuAddr();
-        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(buffer, new_buffer, 0, 0, old_size);
-        buffer->SetEpoch(epoch);
-        pending_destruction.push_back(buffer);
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
         const VAddr cpu_addr_end = cpu_addr + new_size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
+
         return new_buffer;
     }
 
-    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
-        const std::size_t size_1 = first->GetSize();
-        const std::size_t size_2 = second->GetSize();
-        const VAddr first_addr = first->GetCpuAddr();
-        const VAddr second_addr = second->GetCpuAddr();
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
         const VAddr new_addr = std::min(first_addr, second_addr);
         const std::size_t new_size = size_1 + size_2;
-        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
-        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
-        first->SetEpoch(epoch);
-        second->SetEpoch(epoch);
-        pending_destruction.push_back(first);
-        pending_destruction.push_back(second);
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
         const VAddr cpu_addr_end = new_addr + new_size - 1;
-        u64 page_start = new_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
         return new_buffer;
     }
 
-    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
-        OwnerBuffer found;
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
         const VAddr cpu_addr_end = cpu_addr + size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
             auto it = blocks.find(page_start);
             if (it == blocks.end()) {
                 if (found) {
                     found = EnlargeBlock(found);
-                } else {
-                    const VAddr start_addr = (page_start << block_page_bits);
-                    found = CreateBlock(start_addr, block_page_size);
-                    blocks[page_start] = found;
-                }
-            } else {
-                if (found) {
-                    if (found == it->second) {
-                        ++page_start;
-                        continue;
-                    }
-                    found = MergeBlocks(found, it->second);
-                } else {
-                    found = it->second;
+                    continue;
                 }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
             }
-            ++page_start;
         }
-        return found;
+        return found.get();
     }
 
-    void MarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 it->second = it->second + 1;
             } else {
-                written_pages[page_start] = 1;
+                written_pages.insert_or_assign(page_start, 1);
             }
-            ++page_start;
         }
     }
 
-    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 if (it->second > 1) {
@@ -547,22 +544,24 @@ private:
                     written_pages.erase(it);
                 }
             }
-            ++page_start;
         }
     }
 
-    bool IsRegionWritten(const VAddr start, const VAddr end) const {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             if (written_pages.count(page_start) > 0) {
                 return true;
             }
-            ++page_start;
         }
         return false;
     }
 
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.push(std::move(buffer));
+    }
+
     void MarkForAsyncFlush(MapInterval* map) {
         if (!uncommitted_flushes) {
             uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
@@ -574,9 +573,7 @@ private:
     Core::System& system;
 
     std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
+    BufferType stream_buffer_handle;
 
     u8* buffer_ptr = nullptr;
     u64 buffer_offset = 0;
@@ -586,18 +583,15 @@ private:
     boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
         mapped_addresses;
 
-    static constexpr u64 write_page_bit = 11;
     std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
 
-    static constexpr u64 block_page_bits = 21;
-    static constexpr u64 block_page_size = 1ULL << block_page_bits;
-    std::unordered_map<u64, OwnerBuffer> blocks;
-
-    std::list<OwnerBuffer> pending_destruction;
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
     u64 epoch = 0;
     u64 modified_ticks = 0;
 
     std::vector<u8> staging_buffer;
+
     std::list<MapInterval*> marked_for_unregister;
 
     std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
new file mode 100644
index 000000000..6c426b035
--- /dev/null
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,168 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+
+constexpr std::array VIEW_CLASS_128_BITS = {
+    PixelFormat::RGBA32F,
+    PixelFormat::RGBA32UI,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+
+constexpr std::array VIEW_CLASS_96_BITS = {
+    PixelFormat::RGB32F,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS = {
+    PixelFormat::RG16F,        PixelFormat::R11FG11FB10F, PixelFormat::R32F,
+    PixelFormat::A2B10G10R10U, PixelFormat::RG16UI,       PixelFormat::R32UI,
+    PixelFormat::RG16I,        PixelFormat::R32I,         PixelFormat::ABGR8U,
+    PixelFormat::RG16,         PixelFormat::ABGR8S,       PixelFormat::RG16S,
+    PixelFormat::RGBA8_SRGB,   PixelFormat::E5B9G9R9F,    PixelFormat::BGRA8,
+    PixelFormat::BGRA8_SRGB,
+};
+// Missing formats:
+// PixelFormat::RGBA8UI
+// PixelFormat::RGBA8I
+// PixelFormat::RGB10_A2_UI
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS = {
+    PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I,
+    PixelFormat::RG8U, PixelFormat::R16U,  PixelFormat::RG8S,  PixelFormat::R16S,
+};
+// Missing formats:
+// PixelFormat::RG8I
+
+constexpr std::array VIEW_CLASS_8_BITS = {
+    PixelFormat::R8UI,
+    PixelFormat::R8U,
+};
+// Missing formats:
+// PixelFormat::R8I
+// PixelFormat::R8S
+
+constexpr std::array VIEW_CLASS_RGTC1_RED = {
+    PixelFormat::DXN1,
+};
+// Missing formats:
+// COMPRESSED_SIGNED_RED_RGTC1
+
+constexpr std::array VIEW_CLASS_RGTC2_RG = {
+    PixelFormat::DXN2UNORM,
+    PixelFormat::DXN2SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+    PixelFormat::BC7U,
+    PixelFormat::BC7U_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+    PixelFormat::BC6H_SF16,
+    PixelFormat::BC6H_UF16,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+constexpr std::array COPY_CLASS_128_BITS = {
+    PixelFormat::RGBA32UI,   PixelFormat::RGBA32F,   PixelFormat::DXT23,
+    PixelFormat::DXT23_SRGB, PixelFormat::DXT45,     PixelFormat::DXT45_SRGB,
+    PixelFormat::DXN2SNORM,  PixelFormat::BC7U,      PixelFormat::BC7U_SRGB,
+    PixelFormat::BC6H_SF16,  PixelFormat::BC6H_UF16,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+// NOTE(review): COMPRESSED_RG_RGTC2 may correspond to PixelFormat::DXN2UNORM (listed in
+// VIEW_CLASS_RGTC2_RG above); confirm whether it belongs in this class.
+
+constexpr std::array COPY_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI,  PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I,
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+/// Marks format_a and format_b as compatible with each other; both directions are recorded.
+void Enable(FormatCompatibility::Table& compatibility, size_t format_a, size_t format_b) {
+    compatibility[format_a][format_b] = true;
+    compatibility[format_b][format_a] = true;
+}
+
+/// Convenience overload taking PixelFormat values instead of raw table indices.
+void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
+    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+}
+
+/// Marks every pair of formats drawn from range as mutually compatible.
+/// Each element is also paired with itself, since the inner iterator starts at it_a.
+template <typename Range>
+void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
+        for (auto it_b = it_a; it_b != range.end(); ++it_b) {
+            Enable(compatibility, *it_a, *it_b);
+        }
+    }
+}
+
+} // Anonymous namespace
+
+FormatCompatibility::FormatCompatibility() {
+    for (size_t i = 0; i < MaxPixelFormat; ++i) {
+        // Identity is allowed
+        Enable(view, i, i);
+    }
+
+    EnableRange(view, VIEW_CLASS_128_BITS);
+    EnableRange(view, VIEW_CLASS_96_BITS);
+    EnableRange(view, VIEW_CLASS_64_BITS);
+    EnableRange(view, VIEW_CLASS_32_BITS);
+    EnableRange(view, VIEW_CLASS_16_BITS);
+    EnableRange(view, VIEW_CLASS_8_BITS);
+    EnableRange(view, VIEW_CLASS_RGTC1_RED);
+    EnableRange(view, VIEW_CLASS_RGTC2_RG);
+    EnableRange(view, VIEW_CLASS_BPTC_UNORM);
+    EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+
+    // Everything that is view compatible is also treated as copy compatible
+    copy = view;
+    EnableRange(copy, COPY_CLASS_128_BITS);
+    EnableRange(copy, COPY_CLASS_64_BITS);
+}
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h
new file mode 100644
index 000000000..d1082566d
--- /dev/null
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+/// Pairwise pixel format compatibility lookup. Holds one symmetric table for texture views and
+/// one for image copies; both are filled in by the constructor.
+class FormatCompatibility {
+public:
+    /// Square bit matrix indexed by the integer value of two pixel formats.
+    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+
+    explicit FormatCompatibility();
+
+    /// Returns true when format_a and format_b are flagged as compatible in the view table.
+    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+    /// Returns true when format_a and format_b are flagged as compatible in the copy table.
+    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+private:
+    Table view;
+    Table copy;
+};
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index ebe139504..f46e81bb7 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
     virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
     virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                                     u64 offset) const = 0;
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
     virtual u32 GetBoundBuffer() const = 0;
 
     virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index f6237fc6a..a82b06a38 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
     ASSERT(stage == ShaderType::Compute);
     const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 18ceedfaf..b7f668d88 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index e46b153f9..c01436295 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_engine->Execute(macro_positions[entry], parameters);
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
@@ -740,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
     const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& tex_info_buffer = shader.const_buffers[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index b827b112f..ef1618990 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -598,6 +598,7 @@ public:
                 BitField<4, 3, u32> block_height;
                 BitField<8, 3, u32> block_depth;
                 BitField<12, 1, InvMemoryLayout> type;
+                BitField<16, 1, u32> is_3d;
             } memory_layout;
             union {
                 BitField<0, 16, u32> layers;
@@ -1403,6 +1404,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
@@ -1415,6 +1418,14 @@ public:
         return execute_on;
     }
 
+    VideoCore::RasterizerInterface& GetRasterizer() { // Mutable access to `rasterizer`.
+        return rasterizer;
+    }
+
+    const VideoCore::RasterizerInterface& GetRasterizer() const { // Read-only overload.
+        return rasterizer;
+    }
+
     /// Notify a memory write has happened.
     void OnMemoryWrite() {
         dirty.flags |= dirty.on_write_stores;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e7cb87589..d374b73cf 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -661,6 +661,10 @@ union Instruction {
     constexpr Instruction(u64 value) : value{value} {}
     constexpr Instruction(const Instruction& instr) : value(instr.value) {}
 
+    constexpr bool Bit(u64 offset) const { // True when bit `offset` of `value` is set.
+        return static_cast<bool>((value >> offset) & 1);
+    }
+
     BitField<0, 8, Register> gpr0;
     BitField<8, 8, Register> gpr8;
     union {
@@ -1874,7 +1878,9 @@ public:
         HSETP2_C,
         HSETP2_R,
         HSETP2_IMM,
+        HSET2_C,
         HSET2_R,
+        HSET2_IMM,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -2194,7 +2200,9 @@ private:
             INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
             INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
             INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
+            INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
+            INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
             INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
             INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 8eb017f65..482e49711 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
+
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
@@ -154,8 +156,7 @@ u64 GPU::GetTicks() const {
     constexpr u64 gpu_ticks_num = 384;
     constexpr u64 gpu_ticks_den = 625;
 
-    const u64 cpu_ticks = system.CoreTiming().GetTicks();
-    u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
+    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
     if (Settings::values.use_fast_gpu_time) {
         nanoseconds /= 256;
     }
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a1b4c305c..2c42483bd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -284,6 +284,12 @@ public:
     /// core timing events.
     virtual void Start() = 0;
 
+    /// Obtain the CPU Context
+    virtual void ObtainContext() = 0;
+
+    /// Release the CPU Context
+    virtual void ReleaseContext() = 0;
+
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 53305ab43..7b855f63e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa
 GPUAsynch::~GPUAsynch() = default;
 
 void GPUAsynch::Start() {
-    cpu_context->MakeCurrent();
     gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);
 }
 
+void GPUAsynch::ObtainContext() { // Makes cpu_context current; Start() does not do this.
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() { // Counterpart of ObtainContext().
+    cpu_context->DoneCurrent();
+}
+
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 517658612..15e9f1d38 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -25,6 +25,8 @@ public:
     ~GPUAsynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 6f38a672a..aaeb9811d 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase
 
 GPUSynch::~GPUSynch() = default;
 
-void GPUSynch::Start() {
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() { // Makes `context` current; Start() is a no-op.
     context->MakeCurrent();
 }
 
+void GPUSynch::ReleaseContext() { // Counterpart of ObtainContext().
+    context->DoneCurrent();
+}
+
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->Push(std::move(entries));
     dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 4a6e9a01d..762c20aa5 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -24,6 +24,8 @@ public:
     ~GPUSynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c3bb4fe06..738c6f0c1 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
 #include "core/settings.h"
@@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread {
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                       Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
                       SynchState& state) {
-    MicroProfileOnThreadCreate("GpuThread");
+    std::string name = "yuzu:GPU";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
     while (state.queue.Empty())
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index 89077a2d8..a50e7b4e0 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -2,32 +2,78 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <optional>
+#include <boost/container_hash/hash.hpp>
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
 #include "video_core/macro/macro_interpreter.h"
 #include "video_core/macro/macro_jit_x64.h"
 
 namespace Tegra {
 
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<HLEMacro>(maxwell3d)} {} // creates the HLE macro lookup helper
+
+MacroEngine::~MacroEngine() = default; // defined here, where HLEMacro is a complete type
+
 void MacroEngine::AddCode(u32 method, u32 data) {
     uploaded_macro_code[method].push_back(data);
 }
 
-void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+// Runs the macro at `method`, compiling it and probing for an HLE replacement on first use.
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
     auto compiled_macro = macro_cache.find(method);
     if (compiled_macro != macro_cache.end()) {
-        compiled_macro->second->Execute(parameters, method);
+        const auto& cache_info = compiled_macro->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
     } else {
         // Macro not compiled, check if it's uploaded and if so, compile it
-        auto macro_code = uploaded_macro_code.find(method);
+        std::optional<u32> mid_method = std::nullopt;
+        const auto macro_code = uploaded_macro_code.find(method);
         if (macro_code == uploaded_macro_code.end()) {
-            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
-            return;
+            for (const auto& [method_base, code] : uploaded_macro_code) {
+                if (method >= method_base && (method - method_base) < code.size()) {
+                    mid_method = method_base;
+                    break;
+                }
+            }
+            if (!mid_method.has_value()) {
+                UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+                return;
+            }
+        }
+        auto& cache_info = macro_cache[method];
+
+        if (!mid_method.has_value()) {
+            cache_info.lle_program = Compile(macro_code->second);
+            cache_info.hash = boost::hash_value(macro_code->second);
+        } else {
+            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+            const auto rebased_method = method - mid_method.value();
+            // Store the tail of the enclosing upload under `method`, from the rebased offset on.
+            auto& code = uploaded_macro_code[method];
+            code.assign(macro_cached.begin() + rebased_method, macro_cached.end());
+            cache_info.hash = boost::hash_value(code);
+            cache_info.lle_program = Compile(code);
+        }
+
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (hle_program.has_value()) {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program.value());
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
         }
-        macro_cache[method] = Compile(macro_code->second);
-        macro_cache[method]->Execute(parameters, method);
     }
 }
 
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index b76ed891f..4d00b84b0 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -11,9 +11,11 @@
 #include "common/common_types.h"
 
 namespace Tegra {
+
 namespace Engines {
 class Maxwell3D;
 }
+
 namespace Macro {
 constexpr std::size_t NUM_MACRO_REGISTERS = 8;
 enum class Operation : u32 {
@@ -94,6 +96,8 @@ union MethodAddress {
 
 } // namespace Macro
 
+class HLEMacro;
+
 class CachedMacro {
 public:
     virtual ~CachedMacro() = default;
@@ -107,20 +111,29 @@ public:
 
 class MacroEngine {
 public:
-    virtual ~MacroEngine() = default;
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
 
     // Store the uploaded macro code to compile them when they're called.
     void AddCode(u32 method, u32 data);
 
     // Compiles the macro if its not in the cache, and executes the compiled macro
-    void Execute(u32 method, const std::vector<u32>& parameters);
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
 
 protected:
     virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
 
 private:
-    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    struct CacheInfo { // One entry per macro method in macro_cache
+        std::unique_ptr<CachedMacro> lle_program{}; // Result of Compile() for this method
+        std::unique_ptr<CachedMacro> hle_program{}; // Only set when has_hle_program is true
+        u64 hash{};                                 // boost::hash_value() of the macro code
+        bool has_hle_program{}; // Execute() runs hle_program instead of lle_program when set
+    };
+
+    std::unordered_map<u32, CacheInfo> macro_cache;
     std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+    std::unique_ptr<HLEMacro> hle_macros;
 };
 
 std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..410f99018
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,127 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <utility>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions. Each one is named after the hash it is registered under in hle_funcs.
+
+// Programs an index_array based draw from six parameters and issues it. Draw(true, true) is
+// presumably (is_indexed, is_instanced) - confirm against RasterizerInterface::Draw.
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+    // Keep the low 26 bits of the first parameter. The mask is a literal because building it
+    // as `~(0x3ffffff << 26)` overflows a signed int in the shift.
+    const u32 topology = parameters[0] & 0x3ffffffU;
+
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(topology));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    // Reset the transient draw state whether or not the draw was issued.
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+// Programs a vertex_buffer based draw from five parameters and issues it.
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+    maxwell3d.regs.vertex_buffer.first = parameters[3];
+    maxwell3d.regs.vertex_buffer.count = parameters[1];
+    maxwell3d.regs.vb_base_instance = parameters[4];
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = count;
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(false, true);
+    }
+    maxwell3d.regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+// Programs an index_array based draw from six parameters and also sends element_base and
+// base_instance through methods 0x8e3-0x8e5, then zeroes all of them again after the draw.
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // namespace
+
+// Table mapping a macro code hash to the native function that replaces it.
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
+}};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+// Returns a CachedMacro wrapping the function registered for `hash`, or std::nullopt if none.
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
+                                 [hash](const auto& pair) { return pair.first == hash; });
+    if (it == hle_funcs.cend()) {
+        return std::nullopt;
+    }
+    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+    : maxwell3d(maxwell3d), func(func) {}
+
+// Forwards to the wrapped function; `method` is not needed by the native implementation.
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..37af875a0
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,49 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// Signature of a native replacement for an uploaded macro.
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+/// Looks up native replacements for macros by the hash of their code.
+class HLEMacro {
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
+    ~HLEMacro();
+
+    /// Returns a CachedMacro wrapping the replacement registered for `hash`, or std::nullopt
+    /// when no replacement is registered.
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+/// CachedMacro adapter that forwards Execute() to an HLEFunction.
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl() override;
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+    HLEFunction func;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 5edff27aa..aa5256419 100644
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -11,7 +11,8 @@
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} // the base sets up the HLE macro lookup
 
 std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index 11c1cc3be..07292702f 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -14,27 +14,22 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255
 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
 
 namespace Tegra {
-static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
-static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
-static const Xbyak::Reg64 STATE = Xbyak::util::r11;
-static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
-static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
-static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg64 STATE = Xbyak::util::rbx;
+static const Xbyak::Reg32 RESULT = Xbyak::util::ebp;
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
 static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
-static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
 static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
 
 static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
-    PARAMETERS,
-    REGISTERS,
     STATE,
-    NEXT_PARAMETER,
     RESULT,
+    PARAMETERS,
     METHOD_ADDRESS,
     BRANCH_HOLDER,
 });
 
-MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} // the base sets up the HLE macro lookup
 
 std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
@@ -53,32 +48,32 @@ void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
     JITState state{};
     state.maxwell3d = &maxwell3d;
     state.registers = {};
-    state.parameters = parameters.data();
-    program(&state);
+    program(&state, parameters.data());
 }
 
 void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
     const bool is_a_zero = opcode.src_a == 0;
     const bool is_b_zero = opcode.src_b == 0;
     const bool valid_operation = !is_a_zero && !is_b_zero;
-    const bool is_move_operation = !is_a_zero && is_b_zero;
+    [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
     const bool has_zero_register = is_a_zero || is_b_zero;
+    const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                                  opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
 
-    Xbyak::Reg64 src_a;
+    Xbyak::Reg32 src_a;
     Xbyak::Reg32 src_b;
 
-    if (!optimizer.zero_reg_skip) {
-        src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
-        src_b = Compile_GetRegister(opcode.src_b, ebx);
+    if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
+        src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        src_b = Compile_GetRegister(opcode.src_b, eax);
     } else {
         if (!is_a_zero) {
-            src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+            src_a = Compile_GetRegister(opcode.src_a, RESULT);
         }
         if (!is_b_zero) {
-            src_b = Compile_GetRegister(opcode.src_b, ebx);
+            src_b = Compile_GetRegister(opcode.src_b, eax);
         }
     }
-    Xbyak::Label skip_carry{};
 
     bool has_emitted = false;
 
@@ -190,7 +185,8 @@ void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
         opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
         if (next_opcode.has_value()) {
             const auto next = *next_opcode;
-            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+                opcode.dst == next.dst) {
                 return;
             }
         }
@@ -244,10 +240,10 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
 }
 
 void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
-    auto dst = Compile_GetRegister(opcode.src_a, eax);
-    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
 
-    shr(src, al);
+    shr(src, dst.cvt8());
     if (opcode.bf_size != 0 && opcode.bf_size != 31) {
         and_(src, opcode.GetBitfieldMask());
     } else if (opcode.bf_size == 0) {
@@ -263,8 +259,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
 }
 
 void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
-    auto dst = Compile_GetRegister(opcode.src_a, eax);
-    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
 
     if (opcode.bf_src_bit != 0) {
         shr(src, opcode.bf_src_bit);
@@ -273,16 +269,9 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
     if (opcode.bf_size != 31) {
         and_(src, opcode.GetBitfieldMask());
     }
-    shl(src, al);
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
+    shl(src, dst.cvt8());
 
-static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
-    return maxwell3d->GetRegisterValue(method);
-}
-
-static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
-    maxwell3d->CallMethodFromMME(method_address.address, value);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
 }
 
 void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
@@ -302,22 +291,34 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
             sub(result, opcode.immediate * -1);
         }
     }
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
-    mov(Common::X64::ABI_PARAM1, qword[STATE]);
-    mov(Common::X64::ABI_PARAM2, RESULT);
-    Common::X64::CallFarFunction(*this, &Read);
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
-    mov(RESULT, Common::X64::ABI_RETURN.cvt32());
+
+    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+    if (optimizer.enable_asserts) {
+        Xbyak::Label pass_range_check;
+        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+        jb(pass_range_check);
+        int3();
+        L(pass_range_check);
+    }
+    mov(rax, qword[STATE]);
+    mov(RESULT,
+        dword[rax + offsetof(Engines::Maxwell3D, regs) +
+              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+
     Compile_ProcessResult(opcode.result_operation, opcode.dst);
 }
 
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    maxwell3d->CallMethodFromMME(method_address.address, value);
+}
+
 void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
     mov(Common::X64::ABI_PARAM1, qword[STATE]);
     mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
     mov(Common::X64::ABI_PARAM3, value);
     Common::X64::CallFarFunction(*this, &Send);
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
 
     Xbyak::Label dont_process{};
     // Get increment
@@ -329,7 +330,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
     and_(METHOD_ADDRESS, 0xfff);
     shr(ecx, 12);
     and_(ecx, 0x3f);
-    lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
     sal(ecx, 12);
     or_(eax, ecx);
 
@@ -421,19 +422,15 @@ void MacroJITx64Impl::Compile() {
     bool keep_executing = true;
     labels.fill(Xbyak::Label());
 
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
     // JIT state
     mov(STATE, Common::X64::ABI_PARAM1);
-    mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
-                          static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
-    mov(REGISTERS, Common::X64::ABI_PARAM1);
-    add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
     xor_(RESULT, RESULT);
     xor_(METHOD_ADDRESS, METHOD_ADDRESS);
-    xor_(NEXT_PARAMETER, NEXT_PARAMETER);
     xor_(BRANCH_HOLDER, BRANCH_HOLDER);
 
-    mov(dword[REGISTERS + 4], Compile_FetchParameter());
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
 
     // Track get register for zero registers and mark it as no-op
     optimizer.zero_reg_skip = true;
@@ -446,6 +443,9 @@ void MacroJITx64Impl::Compile() {
     // one if our register isn't "dirty"
     optimizer.optimize_for_method_move = true;
 
+    // Enable run-time assertions in JITted code
+    optimizer.enable_asserts = false;
+
     // Check to see if we can skip emitting certain instructions
     Optimizer_ScanFlags();
 
@@ -463,7 +463,7 @@ void MacroJITx64Impl::Compile() {
 
     L(end_of_code);
 
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
     ret();
     ready();
     program = getCode<ProgramType>();
@@ -537,8 +537,8 @@ bool MacroJITx64Impl::Compile_NextInstruction() {
 }
 
 Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
-    mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
-    inc(NEXT_PARAMETER);
+    mov(eax, dword[PARAMETERS]);  // load the u32 that PARAMETERS currently points at
+    add(PARAMETERS, sizeof(u32)); // advance the pointer itself; there is no separate index
     return eax;
 }
 
@@ -547,41 +547,22 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
         // Register 0 is always zero
         xor_(dst, dst);
     } else {
-        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
-    }
-
-    return dst;
-}
-
-Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
-    if (index == 0) {
-        // Register 0 is always zero
-        xor_(dst, dst);
-    } else {
-        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
     }
 
     return dst;
 }
 
-void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
-    Xbyak::Label zero{}, end{};
-    xor_(ecx, ecx);
-    shr(dst, 32);
-    setne(cl);
-    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
-}
-
 void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
-    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+    const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
         // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
         // register.
         if (reg == 0) {
             return;
         }
-        mov(dword[REGISTERS + reg * sizeof(u32)], result);
+        mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
     };
-    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
 
     switch (operation) {
     case Macro::ResultOperation::IgnoreAndFetch:
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
index 21ee157cf..a180e7428 100644
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -55,8 +55,6 @@ private:
 
     Xbyak::Reg32 Compile_FetchParameter();
     Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
-    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
-    void Compile_WriteCarry(Xbyak::Reg64 dst);
 
     void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
     void Compile_Send(Xbyak::Reg32 value);
@@ -67,11 +65,10 @@ private:
     struct JITState {
         Engines::Maxwell3D* maxwell3d{};
         std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
-        const u32* parameters{};
         u32 carry_flag{};
     };
     static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
-    using ProgramType = void (*)(JITState*);
+    using ProgramType = void (*)(JITState*, const u32*);
 
     struct OptimizerState {
         bool can_skip_carry{};
@@ -79,14 +76,15 @@ private:
         bool zero_reg_skip{};
         bool skip_dummy_addimmediate{};
         bool optimize_for_method_move{};
+        bool enable_asserts{};
     };
     OptimizerState optimizer{};
 
     std::optional<Macro::Opcode> next_opcode{};
     ProgramType program{nullptr};
 
-    std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
-    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
     Xbyak::Label end_of_code{};
 
     bool is_delay_slot{};
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index dbee9f634..ff5505d12 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si
     return range == inner_size;
 }
 
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer,
+                              const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
     }
 }
 
-void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                     const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
     }
 }
 
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                               const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
     }
 }
 
-void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                      const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
     }
 }
 
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                              const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlock(src_addr, tmp_buffer.data(), size);
-    WriteBlock(dest_addr, tmp_buffer.data(), size);
+    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
-void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                                    const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
-    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+    ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) {
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 0ddd52d5a..87658e87a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -79,9 +79,9 @@ public:
      * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
      * Flushes and Invalidations, respectively to each operation.
      */
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -93,9 +93,9 @@ public:
      * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
      * being flushed.
      */
-    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * IsGranularRange checks if a gpu region can be simply read with a pointer
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 2f75f8801..e12dab899 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -220,8 +220,8 @@ private:
             return cache_begin < addr_end && addr_begin < cache_end;
         };
 
-        const u64 page_end = addr_end >> PAGE_SHIFT;
-        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
             const auto& it = cached_queries.find(page);
             if (it == std::end(cached_queries)) {
                 continue;
@@ -242,14 +242,14 @@ private:
     /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
     CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
         rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
-        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT;
+        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
         return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
                                                   host_ptr);
     }
 
     /// Tries to a get a cached query. Returns nullptr on failure.
     CachedQuery* TryGet(VAddr addr) {
-        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
         const auto it = cached_queries.find(page);
         if (it == std::end(cached_queries)) {
             return nullptr;
@@ -268,7 +268,7 @@ private:
     }
 
     static constexpr std::uintptr_t PAGE_SIZE = 4096;
-    static constexpr unsigned PAGE_SHIFT = 12;
+    static constexpr unsigned PAGE_BITS = 12;
 
     Core::System& system;
     VideoCore::RasterizerInterface& rasterizer;
diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp
deleted file mode 100644
index 093b2cdf4..000000000
--- a/src/video_core/rasterizer_cache.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "video_core/rasterizer_cache.h"
-
-RasterizerCacheObject::~RasterizerCacheObject() = default;
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
deleted file mode 100644
index 096ee337c..000000000
--- a/src/video_core/rasterizer_cache.h
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <mutex>
-#include <set>
-#include <unordered_map>
-
-#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range_core.hpp>
-
-#include "common/common_types.h"
-#include "core/settings.h"
-#include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
-
-class RasterizerCacheObject {
-public:
-    explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {}
-
-    virtual ~RasterizerCacheObject();
-
-    VAddr GetCpuAddr() const {
-        return cpu_addr;
-    }
-
-    /// Gets the size of the shader in guest memory, required for cache management
-    virtual std::size_t GetSizeInBytes() const = 0;
-
-    /// Sets whether the cached object should be considered registered
-    void SetIsRegistered(bool registered) {
-        is_registered = registered;
-    }
-
-    /// Returns true if the cached object is registered
-    bool IsRegistered() const {
-        return is_registered;
-    }
-
-    /// Returns true if the cached object is dirty
-    bool IsDirty() const {
-        return is_dirty;
-    }
-
-    /// Returns ticks from when this cached object was last modified
-    u64 GetLastModifiedTicks() const {
-        return last_modified_ticks;
-    }
-
-    /// Marks an object as recently modified, used to specify whether it is clean or dirty
-    template <class T>
-    void MarkAsModified(bool dirty, T& cache) {
-        is_dirty = dirty;
-        last_modified_ticks = cache.GetModifiedTicks();
-    }
-
-    void SetMemoryMarked(bool is_memory_marked_) {
-        is_memory_marked = is_memory_marked_;
-    }
-
-    bool IsMemoryMarked() const {
-        return is_memory_marked;
-    }
-
-    void SetSyncPending(bool is_sync_pending_) {
-        is_sync_pending = is_sync_pending_;
-    }
-
-    bool IsSyncPending() const {
-        return is_sync_pending;
-    }
-
-private:
-    bool is_registered{};      ///< Whether the object is currently registered with the cache
-    bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
-    bool is_memory_marked{};   ///< Whether the object is marking rasterizer memory.
-    bool is_sync_pending{};    ///< Whether the object is pending deletion.
-    u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
-    VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space
-};
-
-template <class T>
-class RasterizerCache : NonCopyable {
-    friend class RasterizerCacheObject;
-
-public:
-    explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
-
-    /// Write any cached resources overlapping the specified region back to memory
-    void FlushRegion(VAddr addr, std::size_t size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            FlushObject(object);
-        }
-    }
-
-    /// Mark the specified region as being invalidated
-    void InvalidateRegion(VAddr addr, u64 size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            if (!object->IsRegistered()) {
-                // Skip duplicates
-                continue;
-            }
-            Unregister(object);
-        }
-    }
-
-    void OnCPUWrite(VAddr addr, std::size_t size) {
-        std::lock_guard lock{mutex};
-
-        for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
-            if (object->IsRegistered()) {
-                UnmarkMemory(object);
-                object->SetSyncPending(true);
-                marked_for_unregister.emplace_back(object);
-            }
-        }
-    }
-
-    void SyncGuestHost() {
-        std::lock_guard lock{mutex};
-
-        for (const auto& object : marked_for_unregister) {
-            if (object->IsRegistered()) {
-                object->SetSyncPending(false);
-                Unregister(object);
-            }
-        }
-        marked_for_unregister.clear();
-    }
-
-    /// Invalidates everything in the cache
-    void InvalidateAll() {
-        std::lock_guard lock{mutex};
-
-        while (interval_cache.begin() != interval_cache.end()) {
-            Unregister(*interval_cache.begin()->second.begin());
-        }
-    }
-
-protected:
-    /// Tries to get an object from the cache with the specified cache address
-    T TryGet(VAddr addr) const {
-        const auto iter = map_cache.find(addr);
-        if (iter != map_cache.end())
-            return iter->second;
-        return nullptr;
-    }
-
-    /// Register an object into the cache
-    virtual void Register(const T& object) {
-        std::lock_guard lock{mutex};
-
-        object->SetIsRegistered(true);
-        interval_cache.add({GetInterval(object), ObjectSet{object}});
-        map_cache.insert({object->GetCpuAddr(), object});
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
-        object->SetMemoryMarked(true);
-    }
-
-    /// Unregisters an object from the cache
-    virtual void Unregister(const T& object) {
-        std::lock_guard lock{mutex};
-
-        UnmarkMemory(object);
-        object->SetIsRegistered(false);
-        if (object->IsSyncPending()) {
-            marked_for_unregister.remove(object);
-            object->SetSyncPending(false);
-        }
-        const VAddr addr = object->GetCpuAddr();
-        interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(addr);
-    }
-
-    void UnmarkMemory(const T& object) {
-        if (!object->IsMemoryMarked()) {
-            return;
-        }
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
-        object->SetMemoryMarked(false);
-    }
-
-    /// Returns a ticks counter used for tracking when cached objects were last modified
-    u64 GetModifiedTicks() {
-        std::lock_guard lock{mutex};
-
-        return ++modified_ticks;
-    }
-
-    virtual void FlushObjectInner(const T& object) = 0;
-
-    /// Flushes the specified object, updating appropriate cache state as needed
-    void FlushObject(const T& object) {
-        std::lock_guard lock{mutex};
-
-        if (!object->IsDirty()) {
-            return;
-        }
-        FlushObjectInner(object);
-        object->MarkAsModified(false, *this);
-    }
-
-    std::recursive_mutex mutex;
-
-private:
-    /// Returns a list of cached objects from the specified memory region, ordered by access time
-    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
-        if (size == 0) {
-            return {};
-        }
-
-        std::vector<T> objects;
-        const ObjectInterval interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) {
-            for (auto& cached_object : pair.second) {
-                if (!cached_object) {
-                    continue;
-                }
-                objects.push_back(cached_object);
-            }
-        }
-
-        std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool {
-            return a->GetLastModifiedTicks() < b->GetLastModifiedTicks();
-        });
-
-        return objects;
-    }
-
-    using ObjectSet = std::set<T>;
-    using ObjectCache = std::unordered_map<VAddr, T>;
-    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
-    using ObjectInterval = typename IntervalCache::interval_type;
-
-    static auto GetInterval(const T& object) {
-        return ObjectInterval::right_open(object->GetCpuAddr(),
-                                          object->GetCpuAddr() + object->GetSizeInBytes());
-    }
-
-    ObjectCache map_cache;
-    IntervalCache interval_cache; ///< Cache of objects
-    u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
-    VideoCore::RasterizerInterface& rasterizer;
-    std::list<T> marked_for_unregister;
-};
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..eb5158407
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2073 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&;
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
+
+char Swizzle(std::size_t component) {
+    ASSERT(component < 4);
+    return "xyzw"[component]; // 0, 1, 2, 3 -> 'x', 'y', 'z', 'w'
+}
+
+constexpr bool IsGenericAttribute(Attribute::Index index) {
+    return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) {
+    ASSERT(IsGenericAttribute(index));
+    return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+std::string_view Modifiers(Operation operation) {
+    // Instruction suffix for an operation: ".PREC" when its metadata is arithmetic metadata
+    // flagged as precise, otherwise an empty suffix.
+    const auto* const arithmetic = std::get_if<MetaArithmetic>(&operation.GetMeta());
+    const bool is_precise = arithmetic != nullptr && arithmetic->precise;
+    return is_precise ? ".PREC" : "";
+}
+
+std::string_view GetInputFlags(PixelImap attribute) {
+    switch (attribute) {
+    case PixelImap::Perspective:
+        return "";
+    case PixelImap::Constant:
+        return "FLAT ";
+    case PixelImap::ScreenLinear:
+        return "NOPERSPECTIVE ";
+    case PixelImap::Unused:
+        break;
+    }
+    UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+    return {};
+}
+
+std::string_view ImageType(Tegra::Shader::ImageType image_type) {
+    switch (image_type) {
+    case Tegra::Shader::ImageType::Texture1D:
+        return "1D";
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return "BUFFER";
+    case Tegra::Shader::ImageType::Texture1DArray:
+        return "ARRAY1D";
+    case Tegra::Shader::ImageType::Texture2D:
+        return "2D";
+    case Tegra::Shader::ImageType::Texture2DArray:
+        return "ARRAY2D";
+    case Tegra::Shader::ImageType::Texture3D:
+        return "3D";
+    }
+    UNREACHABLE();
+    return {};
+}
+
+std::string_view StackName(MetaStackClass stack) { // Maps a stack class to "SSY" or "PBK"
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "SSY";
+    case MetaStackClass::Pbk:
+        return "PBK";
+    }
+    UNREACHABLE(); // Every enumerator handled above returns; reaching here means a bad value
+    return "";
+}
+
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+    switch (topology) {
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
+        return "POINTS";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+        return "LINES";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+        return "LINES_ADJACENCY";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+        return "TRIANGLES";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+        return "TRIANGLES_ADJACENCY";
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return "POINTS";
+    }
+}
+
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
+    switch (topology) { // Maps a shader output topology to its upper-case primitive name
+    case Tegra::Shader::OutputTopology::PointList:
+        return "POINTS";
+    case Tegra::Shader::OutputTopology::LineStrip:
+        return "LINE_STRIP";
+    case Tegra::Shader::OutputTopology::TriangleStrip:
+        return "TRIANGLE_STRIP";
+    default:
+        UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+        return "POINTS"; // Fallback uses the same upper-case spelling as the cases above
+    }
+}
+
+std::string_view StageInputName(ShaderType stage) {
+    switch (stage) {
+    case ShaderType::Vertex:
+    case ShaderType::Geometry:
+        return "vertex";
+    case ShaderType::Fragment:
+        return "fragment";
+    case ShaderType::Compute:
+        return "invocation";
+    default:
+        UNREACHABLE();
+        return "";
+    }
+}
+
+std::string TextureType(const MetaTexture& meta) {
+    if (meta.sampler.is_buffer) {
+        return "BUFFER";
+    }
+    std::string type;
+    if (meta.sampler.is_shadow) {
+        type += "SHADOW";
+    }
+    if (meta.sampler.is_array) {
+        type += "ARRAY";
+    }
+    type += [&meta] {
+        switch (meta.sampler.type) {
+        case Tegra::Shader::TextureType::Texture1D:
+            return "1D";
+        case Tegra::Shader::TextureType::Texture2D:
+            return "2D";
+        case Tegra::Shader::TextureType::Texture3D:
+            return "3D";
+        case Tegra::Shader::TextureType::TextureCube:
+            return "CUBE";
+        }
+        UNREACHABLE();
+        return "2D";
+    }();
+    return type;
+}
+
+std::string GlobalMemoryName(const GlobalMemoryBase& base) {
+    return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
+}
+
+class ARBDecompiler final {
+public:
+    explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                           ShaderType stage, std::string_view identifier);
+
+    std::string Code() const {
+        return shader_source;
+    }
+
+private:
+    void DeclareHeader();
+    void DeclareVertex();
+    void DeclareGeometry();
+    void DeclareFragment();
+    void DeclareCompute();
+    void DeclareInputAttributes();
+    void DeclareOutputAttributes();
+    void DeclareLocalMemory();
+    void DeclareGlobalMemory();
+    void DeclareConstantBuffers();
+    void DeclareRegisters();
+    void DeclareTemporaries();
+    void DeclarePredicates();
+    void DeclareInternalFlags();
+
+    void InitializeVariables();
+
+    void DecompileAST();
+    void DecompileBranchMode();
+
+    void VisitAST(const ASTNode& node);
+    std::string VisitExpression(const Expr& node);
+
+    void VisitBlock(const NodeBlock& bb);
+
+    std::string Visit(const Node& node);
+
+    std::pair<std::string, std::size_t> BuildCoords(Operation);
+    std::string BuildAoffi(Operation);
+    void Exit();
+
+    std::string Assign(Operation);
+    std::string Select(Operation);
+    std::string FClamp(Operation);
+    std::string FCastHalf0(Operation);
+    std::string FCastHalf1(Operation);
+    std::string FSqrt(Operation);
+    std::string FSwizzleAdd(Operation);
+    std::string HAdd2(Operation);
+    std::string HMul2(Operation);
+    std::string HFma2(Operation);
+    std::string HAbsolute(Operation);
+    std::string HNegate(Operation);
+    std::string HClamp(Operation);
+    std::string HCastFloat(Operation);
+    std::string HUnpack(Operation);
+    std::string HMergeF32(Operation);
+    std::string HMergeH0(Operation);
+    std::string HMergeH1(Operation);
+    std::string HPack2(Operation);
+    std::string LogicalAssign(Operation);
+    std::string LogicalPick2(Operation);
+    std::string LogicalAnd2(Operation);
+    std::string FloatOrdered(Operation);
+    std::string FloatUnordered(Operation);
+    std::string LogicalAddCarry(Operation);
+    std::string Texture(Operation);
+    std::string TextureGather(Operation);
+    std::string TextureQueryDimensions(Operation);
+    std::string TextureQueryLod(Operation);
+    std::string TexelFetch(Operation);
+    std::string TextureGradient(Operation);
+    std::string ImageLoad(Operation);
+    std::string ImageStore(Operation);
+    std::string Branch(Operation);
+    std::string BranchIndirect(Operation);
+    std::string PushFlowStack(Operation);
+    std::string PopFlowStack(Operation);
+    std::string Exit(Operation);
+    std::string Discard(Operation);
+    std::string EmitVertex(Operation);
+    std::string EndPrimitive(Operation);
+    std::string InvocationId(Operation);
+    std::string YNegate(Operation);
+    std::string ThreadId(Operation);
+    std::string ShuffleIndexed(Operation);
+    std::string Barrier(Operation);
+    std::string MemoryBarrierGroup(Operation);
+    std::string MemoryBarrierGlobal(Operation);
+
+    template <const std::string_view& op>
+    std::string Unary(Operation operation) {
+        std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Binary(Operation operation) {
+        std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Trinary(Operation operation) {
+        std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]), Visit(operation[2]));
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool unordered>
+    std::string FloatComparison(Operation operation) {
+        std::string temporary = AllocTemporary();
+        AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
+        AddLine("MOV.S {}, 0;", temporary);
+        AddLine("MOV.S {} (NE.x), -1;", temporary);
+
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        if constexpr (unordered) {
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+        } else if (op == SNE_F) {
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+        }
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool is_nan>
+    std::string HalfComparison(Operation operation) {
+        std::string tmp1 = AllocVectorTemporary();
+        const std::string tmp2 = AllocVectorTemporary();
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        AddLine("UP2H.F {}, {};", tmp1, op_a);
+        AddLine("UP2H.F {}, {};", tmp2, op_b);
+        AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2);
+        AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+        AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1);
+        AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
+        AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+        if constexpr (is_nan) {
+            AddLine("MOVC.F RC.x, {};", op_a);
+            AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+            AddLine("MOVC.F RC.x, {};", op_b);
+            AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+        }
+        return tmp1;
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string AtomicImage(Operation operation) {
+        const auto& meta = std::get<MetaImage>(operation.GetMeta());
+        const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+        const std::size_t num_coords = operation.GetOperandsCount();
+        const std::size_t num_values = meta.values.size();
+
+        const std::string coord = AllocVectorTemporary();
+        const std::string value = AllocVectorTemporary();
+        for (std::size_t i = 0; i < num_coords; ++i) {
+            AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+        }
+        for (std::size_t i = 0; i < num_values; ++i) {
+            AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+        }
+
+        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord,
+                image_id, ImageType(meta.image.type));
+        return fmt::format("{}.x", coord);
+    }
+
+    /// Emits an atomic on the memory node held in operand 0, using operand 1 as the value, and
+    /// returns the scalar temporary that receives the result. Global memory nodes emit "ATOMB"
+    /// and shared memory nodes emit "ATOMS"; any other node kind is reported as unreachable.
+    /// @tparam op   Text placed after the instruction name
+    /// @tparam type Text placed after the operation
+    template <const std::string_view& op, const std::string_view& type>
+    std::string Atomic(Operation operation) {
+        std::string result = AllocTemporary();
+        std::string location;
+        std::string_view mnemonic;
+        if (const auto shared = std::get_if<SmemNode>(&*operation[0])) {
+            mnemonic = "ATOMS";
+            location = fmt::format("shared_mem[{}]", Visit(shared->GetAddress()));
+        } else if (const auto global = std::get_if<GmemNode>(&*operation[0])) {
+            // The result temporary first holds the real address minus the base address
+            AddLine("SUB.U {}, {}, {};", result, Visit(global->GetRealAddress()),
+                    Visit(global->GetBaseAddress()));
+            mnemonic = "ATOMB";
+            location = fmt::format("{}[{}]", GlobalMemoryName(global->GetDescriptor()), result);
+        } else {
+            UNREACHABLE();
+            return "{0, 0, 0, 0}";
+        }
+        AddLine("{}.{}.{} {}, {}, {};", mnemonic, op, type, result, Visit(operation[1]),
+                location);
+        return result;
+    }
+
+    /// Moves the negated first operand into a fresh scalar temporary and returns it.
+    /// @tparam type Type suffix of the MOV instruction; 'F' is emitted as "F32"
+    template <char type>
+    std::string Negate(Operation operation) {
+        // Allocate the destination before visiting the operand, which may allocate too
+        std::string dest = AllocTemporary();
+        const std::string source = Visit(operation[0]);
+        if constexpr (type != 'F') {
+            AddLine("MOV.{} {}, -{};", type, dest, source);
+        } else {
+            AddLine("MOV.F32 {}, -{};", dest, source);
+        }
+        return dest;
+    }
+
+    /// Moves the absolute value of the first operand into a fresh scalar temporary and returns it.
+    /// @tparam type Type suffix of the MOV instruction
+    template <char type>
+    std::string Absolute(Operation operation) {
+        // Allocate the destination before visiting the operand, which may allocate too
+        std::string dest = AllocTemporary();
+        const std::string source = Visit(operation[0]);
+        AddLine("MOV.{} {}, |{}|;", type, dest, source);
+        return dest;
+    }
+
+    /// Emits a BFI instruction and returns the scalar operand holding its result.
+    /// Operand 3 is moved into .x and operand 2 into .y of a vector temporary that is passed as
+    /// the first source; operands 1 and 0 follow as the remaining sources.
+    /// @tparam type Type suffix of the emitted instructions
+    template <char type>
+    std::string BitfieldInsert(Operation operation) {
+        const std::string packed = AllocVectorTemporary();
+        const std::string first = Visit(operation[3]);
+        AddLine("MOV.{} {}.x, {};", type, packed, first);
+        const std::string second = Visit(operation[2]);
+        AddLine("MOV.{} {}.y, {};", type, packed, second);
+        AddLine("BFI.{} {}.x, {}, {}, {};", type, packed, packed, Visit(operation[1]),
+                Visit(operation[0]));
+        return fmt::format("{}.x", packed);
+    }
+
+    /// Emits a BFE instruction and returns the scalar operand holding its result.
+    /// Operand 2 is moved into .x and operand 1 into .y of a vector temporary that is passed as
+    /// the first source; operand 0 is the second source.
+    /// @tparam type Type suffix of the emitted instructions
+    template <char type>
+    std::string BitfieldExtract(Operation operation) {
+        const std::string packed = AllocVectorTemporary();
+        const std::string first = Visit(operation[2]);
+        AddLine("MOV.{} {}.x, {};", type, packed, first);
+        const std::string second = Visit(operation[1]);
+        AddLine("MOV.{} {}.y, {};", type, packed, second);
+        const std::string value = Visit(operation[0]);
+        AddLine("BFE.{} {}.x, {}, {};", type, packed, packed, value);
+        return fmt::format("{}.x", packed);
+    }
+
+    /// Returns the "invocation.localid" operand for one component; emits no instructions.
+    /// @tparam swizzle Component appended to the operand
+    template <char swizzle>
+    std::string LocalInvocationId(Operation) {
+        std::string operand = fmt::format("invocation.localid.{}", swizzle);
+        return operand;
+    }
+
+    /// Returns the "invocation.groupid" operand for one component; emits no instructions.
+    /// @tparam swizzle Component appended to the operand
+    template <char swizzle>
+    std::string WorkGroupId(Operation) {
+        std::string operand = fmt::format("invocation.groupid.{}", swizzle);
+        return operand;
+    }
+
+    /// Returns the "<stage input>.thread<c1><c2>mask" operand; emits no instructions.
+    /// @tparam c1 First character of the mask name
+    /// @tparam c2 Second character of the mask name
+    template <char c1, char c2>
+    std::string ThreadMask(Operation) {
+        const std::string_view stage_input = StageInputName(stage);
+        return fmt::format("{}.thread{}{}mask", stage_input, c1, c2);
+    }
+
+    /// Formats text with args and appends the result to the program without a line break.
+    template <typename... Args>
+    void AddExpression(std::string_view text, Args&&... args) {
+        shader_source.append(fmt::format(text, std::forward<Args>(args)...));
+    }
+
+    /// Formats text with args and appends it to the program followed by a newline.
+    template <typename... Args>
+    void AddLine(std::string_view text, Args&&... args) {
+        AddExpression(text, std::forward<Args>(args)...);
+        shader_source.push_back('\n');
+    }
+
+    /// Hands out the next temporary as a scalar operand ("T<n>.x") and records the highest
+    /// number of temporaries in use so far.
+    std::string AllocTemporary() {
+        const std::size_t index = num_temporaries++;
+        max_temporaries = std::max(max_temporaries, num_temporaries);
+        return fmt::format("T{}.x", index);
+    }
+
+    /// Hands out the next temporary as a whole register ("T<n>") and records the highest
+    /// number of temporaries in use so far.
+    std::string AllocVectorTemporary() {
+        const std::size_t index = num_temporaries++;
+        max_temporaries = std::max(max_temporaries, num_temporaries);
+        return fmt::format("T{}", index);
+    }
+
+    /// Makes every temporary available again; max_temporaries is left untouched.
+    void ResetTemporaries() noexcept {
+        num_temporaries = std::size_t{};
+    }
+
+    const Device& device; // Queried for base bindings and optional feature support
+    const ShaderIR& ir; // Shader IR being decompiled
+    const Registry& registry; // Source of the graphics and compute info used by declarations
+    const ShaderType stage; // Stage this program is generated for
+
+    std::size_t num_temporaries = 0; // Index of the next T<n> temporary to hand out
+    std::size_t max_temporaries = 0; // Highest temporary count reached; drives DeclareTemporaries
+
+    std::string shader_source; // Program text accumulated by AddExpression/AddLine
+
+    static constexpr std::string_view ADD_F32 = "ADD.F32"; // Mnemonics bound as template arguments in OPERATION_DECOMPILERS
+    static constexpr std::string_view ADD_S = "ADD.S";
+    static constexpr std::string_view ADD_U = "ADD.U";
+    static constexpr std::string_view MUL_F32 = "MUL.F32";
+    static constexpr std::string_view MUL_S = "MUL.S";
+    static constexpr std::string_view MUL_U = "MUL.U";
+    static constexpr std::string_view DIV_F32 = "DIV.F32";
+    static constexpr std::string_view DIV_S = "DIV.S";
+    static constexpr std::string_view DIV_U = "DIV.U";
+    static constexpr std::string_view MAD_F32 = "MAD.F32";
+    static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+    static constexpr std::string_view COS_F32 = "COS.F32";
+    static constexpr std::string_view SIN_F32 = "SIN.F32";
+    static constexpr std::string_view EX2_F32 = "EX2.F32";
+    static constexpr std::string_view LG2_F32 = "LG2.F32";
+    static constexpr std::string_view SLT_F = "SLT.F32"; // Comparison mnemonics
+    static constexpr std::string_view SLT_S = "SLT.S";
+    static constexpr std::string_view SLT_U = "SLT.U";
+    static constexpr std::string_view SEQ_F = "SEQ.F32";
+    static constexpr std::string_view SEQ_S = "SEQ.S";
+    static constexpr std::string_view SEQ_U = "SEQ.U";
+    static constexpr std::string_view SLE_F = "SLE.F32";
+    static constexpr std::string_view SLE_S = "SLE.S";
+    static constexpr std::string_view SLE_U = "SLE.U";
+    static constexpr std::string_view SGT_F = "SGT.F32";
+    static constexpr std::string_view SGT_S = "SGT.S";
+    static constexpr std::string_view SGT_U = "SGT.U";
+    static constexpr std::string_view SNE_F = "SNE.F32";
+    static constexpr std::string_view SNE_S = "SNE.S";
+    static constexpr std::string_view SNE_U = "SNE.U";
+    static constexpr std::string_view SGE_F = "SGE.F32";
+    static constexpr std::string_view SGE_S = "SGE.S";
+    static constexpr std::string_view SGE_U = "SGE.U";
+    static constexpr std::string_view AND_S = "AND.S";
+    static constexpr std::string_view AND_U = "AND.U";
+    static constexpr std::string_view TRUNC_F = "TRUNC.F";
+    static constexpr std::string_view TRUNC_S = "TRUNC.S";
+    static constexpr std::string_view TRUNC_U = "TRUNC.U";
+    static constexpr std::string_view SHL_S = "SHL.S";
+    static constexpr std::string_view SHL_U = "SHL.U";
+    static constexpr std::string_view SHR_S = "SHR.S";
+    static constexpr std::string_view SHR_U = "SHR.U";
+    static constexpr std::string_view OR_S = "OR.S";
+    static constexpr std::string_view OR_U = "OR.U";
+    static constexpr std::string_view XOR_S = "XOR.S";
+    static constexpr std::string_view XOR_U = "XOR.U";
+    static constexpr std::string_view NOT_S = "NOT.S";
+    static constexpr std::string_view NOT_U = "NOT.U";
+    static constexpr std::string_view BTC_S = "BTC.S";
+    static constexpr std::string_view BTC_U = "BTC.U";
+    static constexpr std::string_view BTFM_S = "BTFM.S";
+    static constexpr std::string_view BTFM_U = "BTFM.U";
+    static constexpr std::string_view ROUND_F = "ROUND.F";
+    static constexpr std::string_view CEIL_F = "CEIL.F";
+    static constexpr std::string_view FLR_F = "FLR.F";
+    static constexpr std::string_view I2F_S = "I2F.S";
+    static constexpr std::string_view I2F_U = "I2F.U";
+    static constexpr std::string_view MIN_F = "MIN.F";
+    static constexpr std::string_view MIN_S = "MIN.S";
+    static constexpr std::string_view MIN_U = "MIN.U";
+    static constexpr std::string_view MAX_F = "MAX.F";
+    static constexpr std::string_view MAX_S = "MAX.S";
+    static constexpr std::string_view MAX_U = "MAX.U";
+    static constexpr std::string_view MOV_U = "MOV.U";
+    static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+    static constexpr std::string_view TGALL_U = "TGALL.U";
+    static constexpr std::string_view TGANY_U = "TGANY.U";
+    static constexpr std::string_view TGEQ_U = "TGEQ.U";
+    static constexpr std::string_view EXCH = "EXCH"; // Operation names passed as `op` to AtomicImage/Atomic
+    static constexpr std::string_view ADD = "ADD";
+    static constexpr std::string_view MIN = "MIN";
+    static constexpr std::string_view MAX = "MAX";
+    static constexpr std::string_view AND = "AND";
+    static constexpr std::string_view OR = "OR";
+    static constexpr std::string_view XOR = "XOR";
+    static constexpr std::string_view U32 = "U32"; // Type names passed as `type` to AtomicImage/Atomic
+    static constexpr std::string_view S32 = "S32";
+
+    static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); // One slot per operation code
+    using DecompilerType = std::string (ARBDecompiler::*)(Operation); // Handler returning the operand that holds the result
+    static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { // Visit() indexes this with the operation code cast to std::size_t
+        &ARBDecompiler::Assign,
+
+        &ARBDecompiler::Select,
+
+        &ARBDecompiler::Binary<ADD_F32>, // Float handlers
+        &ARBDecompiler::Binary<MUL_F32>,
+        &ARBDecompiler::Binary<DIV_F32>,
+        &ARBDecompiler::Trinary<MAD_F32>,
+        &ARBDecompiler::Negate<'F'>,
+        &ARBDecompiler::Absolute<'F'>,
+        &ARBDecompiler::FClamp,
+        &ARBDecompiler::FCastHalf0,
+        &ARBDecompiler::FCastHalf1,
+        &ARBDecompiler::Binary<MIN_F>,
+        &ARBDecompiler::Binary<MAX_F>,
+        &ARBDecompiler::Unary<COS_F32>,
+        &ARBDecompiler::Unary<SIN_F32>,
+        &ARBDecompiler::Unary<EX2_F32>,
+        &ARBDecompiler::Unary<LG2_F32>,
+        &ARBDecompiler::Unary<RSQ_F32>,
+        &ARBDecompiler::FSqrt,
+        &ARBDecompiler::Unary<ROUND_F>,
+        &ARBDecompiler::Unary<FLR_F>,
+        &ARBDecompiler::Unary<CEIL_F>,
+        &ARBDecompiler::Unary<TRUNC_F>,
+        &ARBDecompiler::Unary<I2F_S>,
+        &ARBDecompiler::Unary<I2F_U>,
+        &ARBDecompiler::FSwizzleAdd,
+
+        &ARBDecompiler::Binary<ADD_S>, // Signed integer handlers
+        &ARBDecompiler::Binary<MUL_S>,
+        &ARBDecompiler::Binary<DIV_S>,
+        &ARBDecompiler::Negate<'S'>,
+        &ARBDecompiler::Absolute<'S'>,
+        &ARBDecompiler::Binary<MIN_S>,
+        &ARBDecompiler::Binary<MAX_S>,
+
+        &ARBDecompiler::Unary<TRUNC_S>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_S>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_S>,
+        &ARBDecompiler::Binary<AND_S>,
+        &ARBDecompiler::Binary<OR_S>,
+        &ARBDecompiler::Binary<XOR_S>,
+        &ARBDecompiler::Unary<NOT_S>,
+        &ARBDecompiler::BitfieldInsert<'S'>,
+        &ARBDecompiler::BitfieldExtract<'S'>,
+        &ARBDecompiler::Unary<BTC_S>,
+        &ARBDecompiler::Unary<BTFM_S>,
+
+        &ARBDecompiler::Binary<ADD_U>, // Unsigned integer handlers
+        &ARBDecompiler::Binary<MUL_U>,
+        &ARBDecompiler::Binary<DIV_U>,
+        &ARBDecompiler::Binary<MIN_U>,
+        &ARBDecompiler::Binary<MAX_U>,
+        &ARBDecompiler::Unary<TRUNC_U>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_U>, // NOTE(review): same handler as the previous slot, presumably intentional - confirm against OperationCode
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::BitfieldInsert<'U'>,
+        &ARBDecompiler::BitfieldExtract<'U'>,
+        &ARBDecompiler::Unary<BTC_U>,
+        &ARBDecompiler::Unary<BTFM_U>,
+
+        &ARBDecompiler::HAdd2, // Half float handlers
+        &ARBDecompiler::HMul2,
+        &ARBDecompiler::HFma2,
+        &ARBDecompiler::HAbsolute,
+        &ARBDecompiler::HNegate,
+        &ARBDecompiler::HClamp,
+        &ARBDecompiler::HCastFloat,
+        &ARBDecompiler::HUnpack,
+        &ARBDecompiler::HMergeF32,
+        &ARBDecompiler::HMergeH0,
+        &ARBDecompiler::HMergeH1,
+        &ARBDecompiler::HPack2,
+
+        &ARBDecompiler::LogicalAssign, // Logical handlers
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::LogicalPick2,
+        &ARBDecompiler::LogicalAnd2,
+
+        &ARBDecompiler::FloatComparison<SLT_F, false>, // Float comparisons
+        &ARBDecompiler::FloatComparison<SEQ_F, false>,
+        &ARBDecompiler::FloatComparison<SLE_F, false>,
+        &ARBDecompiler::FloatComparison<SGT_F, false>,
+        &ARBDecompiler::FloatComparison<SNE_F, false>,
+        &ARBDecompiler::FloatComparison<SGE_F, false>,
+        &ARBDecompiler::FloatOrdered,
+        &ARBDecompiler::FloatUnordered,
+        &ARBDecompiler::FloatComparison<SLT_F, true>,
+        &ARBDecompiler::FloatComparison<SEQ_F, true>,
+        &ARBDecompiler::FloatComparison<SLE_F, true>,
+        &ARBDecompiler::FloatComparison<SGT_F, true>,
+        &ARBDecompiler::FloatComparison<SNE_F, true>,
+        &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+        &ARBDecompiler::Binary<SLT_S>, // Signed integer comparisons
+        &ARBDecompiler::Binary<SEQ_S>,
+        &ARBDecompiler::Binary<SLE_S>,
+        &ARBDecompiler::Binary<SGT_S>,
+        &ARBDecompiler::Binary<SNE_S>,
+        &ARBDecompiler::Binary<SGE_S>,
+
+        &ARBDecompiler::Binary<SLT_U>, // Unsigned integer comparisons
+        &ARBDecompiler::Binary<SEQ_U>,
+        &ARBDecompiler::Binary<SLE_U>,
+        &ARBDecompiler::Binary<SGT_U>,
+        &ARBDecompiler::Binary<SNE_U>,
+        &ARBDecompiler::Binary<SGE_U>,
+
+        &ARBDecompiler::LogicalAddCarry,
+
+        &ARBDecompiler::HalfComparison<SLT_F, false>, // Half float comparisons
+        &ARBDecompiler::HalfComparison<SEQ_F, false>,
+        &ARBDecompiler::HalfComparison<SLE_F, false>,
+        &ARBDecompiler::HalfComparison<SGT_F, false>,
+        &ARBDecompiler::HalfComparison<SNE_F, false>,
+        &ARBDecompiler::HalfComparison<SGE_F, false>,
+        &ARBDecompiler::HalfComparison<SLT_F, true>,
+        &ARBDecompiler::HalfComparison<SEQ_F, true>,
+        &ARBDecompiler::HalfComparison<SLE_F, true>,
+        &ARBDecompiler::HalfComparison<SGT_F, true>,
+        &ARBDecompiler::HalfComparison<SNE_F, true>,
+        &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+        &ARBDecompiler::Texture, // Texture handlers; the first two slots share one handler
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::TextureGather,
+        &ARBDecompiler::TextureQueryDimensions,
+        &ARBDecompiler::TextureQueryLod,
+        &ARBDecompiler::TexelFetch,
+        &ARBDecompiler::TextureGradient,
+
+        &ARBDecompiler::ImageLoad,
+        &ARBDecompiler::ImageStore,
+
+        &ARBDecompiler::AtomicImage<ADD, U32>, // Image atomics
+        &ARBDecompiler::AtomicImage<AND, U32>,
+        &ARBDecompiler::AtomicImage<OR, U32>,
+        &ARBDecompiler::AtomicImage<XOR, U32>,
+        &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, U32>, // Memory atomics, unsigned then signed
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, S32>,
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Atomic<ADD, U32>, // Second run of atomics without EXCH, unsigned then signed
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Branch, // Control flow handlers
+        &ARBDecompiler::BranchIndirect,
+        &ARBDecompiler::PushFlowStack,
+        &ARBDecompiler::PopFlowStack,
+        &ARBDecompiler::Exit,
+        &ARBDecompiler::Discard,
+
+        &ARBDecompiler::EmitVertex,
+        &ARBDecompiler::EndPrimitive,
+
+        &ARBDecompiler::InvocationId,
+        &ARBDecompiler::YNegate,
+        &ARBDecompiler::LocalInvocationId<'x'>,
+        &ARBDecompiler::LocalInvocationId<'y'>,
+        &ARBDecompiler::LocalInvocationId<'z'>,
+        &ARBDecompiler::WorkGroupId<'x'>,
+        &ARBDecompiler::WorkGroupId<'y'>,
+        &ARBDecompiler::WorkGroupId<'z'>,
+
+        &ARBDecompiler::Unary<TGBALLOT_U>,
+        &ARBDecompiler::Unary<TGALL_U>,
+        &ARBDecompiler::Unary<TGANY_U>,
+        &ARBDecompiler::Unary<TGEQ_U>,
+
+        &ARBDecompiler::ThreadId,
+        &ARBDecompiler::ThreadMask<'e', 'q'>,
+        &ARBDecompiler::ThreadMask<'g', 'e'>,
+        &ARBDecompiler::ThreadMask<'g', 't'>,
+        &ARBDecompiler::ThreadMask<'l', 'e'>,
+        &ARBDecompiler::ThreadMask<'l', 't'>,
+        &ARBDecompiler::ShuffleIndexed,
+
+        &ARBDecompiler::Barrier,
+        &ARBDecompiler::MemoryBarrierGroup,
+        &ARBDecompiler::MemoryBarrierGlobal,
+    };
+};
+
+/// Generates the whole program text into shader_source.
+/// The body is decompiled first and the declarations are emitted afterwards, because some of
+/// them (such as the T<n> temporaries) depend on state gathered while visiting the body. The
+/// body is then appended after the declarations.
+/// @param device     Device queried for bindings and optional features
+/// @param ir         Shader IR to decompile
+/// @param registry   Source of the graphics and compute info
+/// @param stage      Stage the program is generated for
+/// @param identifier Not used by this constructor
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                             ShaderType stage, std::string_view identifier)
+    : device{device}, ir{ir}, registry{registry}, stage{stage} {
+    AddLine("TEMP RC;");
+    AddLine("TEMP FSWZA[4];");
+    AddLine("TEMP FSWZB[4];");
+    if (ir.IsDecompiled()) {
+        DecompileAST();
+    } else {
+        DecompileBranchMode();
+    }
+    AddLine("END");
+
+    // Set the body aside and leave shader_source empty for the declarations. Swapping with an
+    // empty string guarantees that, whereas a moved-from std::string is only left in a valid
+    // but unspecified state.
+    std::string code;
+    code.swap(shader_source);
+
+    DeclareHeader();
+    DeclareVertex();
+    DeclareGeometry();
+    DeclareFragment();
+    DeclareCompute();
+    DeclareInputAttributes();
+    DeclareOutputAttributes();
+    DeclareLocalMemory();
+    DeclareGlobalMemory();
+    DeclareConstantBuffers();
+    DeclareRegisters();
+    DeclareTemporaries();
+    DeclarePredicates();
+    DeclareInternalFlags();
+
+    shader_source += code;
+}
+
+/// Returns the two letter stage abbreviation placed in the program header ("!!NV<name>5.0").
+/// Stages other than vertex, geometry, fragment and compute are reported as unreachable and
+/// yield an empty name.
+std::string_view HeaderStageName(ShaderType stage) {
+    if (stage == ShaderType::Vertex) {
+        return "vp";
+    }
+    if (stage == ShaderType::Geometry) {
+        return "gp";
+    }
+    if (stage == ShaderType::Fragment) {
+        return "fp";
+    }
+    if (stage == ShaderType::Compute) {
+        return "cp";
+    }
+    UNREACHABLE();
+    return "";
+}
+
+/// Emits the program header followed by the OPTION lines: a fixed set first, then the ones
+/// that depend on the stage, the IR and the device.
+void ARBDecompiler::DeclareHeader() {
+    AddLine("!!NV{}5.0", HeaderStageName(stage));
+    // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+    AddLine("OPTION NV_internal;");
+    AddLine("OPTION NV_gpu_program_fp64;");
+    AddLine("OPTION NV_shader_storage_buffer;");
+    AddLine("OPTION NV_shader_thread_group;");
+
+    // Only requested when the shader uses warps and the device reports warp intrinsics
+    const bool wants_shuffle = ir.UsesWarps() && device.HasWarpIntrinsics();
+    if (wants_shuffle) {
+        AddLine("OPTION NV_shader_thread_shuffle;");
+    }
+    if (stage == ShaderType::Vertex && device.HasNvViewportArray2()) {
+        AddLine("OPTION NV_viewport_array2;");
+    }
+    if (stage == ShaderType::Fragment) {
+        AddLine("OPTION ARB_draw_buffers;");
+    }
+    if (device.HasImageLoadFormatted()) {
+        AddLine("OPTION EXT_shader_image_load_formatted;");
+    }
+}
+
+/// Declares the clip output array; does nothing outside of vertex shaders.
+void ARBDecompiler::DeclareVertex() {
+    if (stage == ShaderType::Vertex) {
+        AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+    }
+}
+
+/// Declares the input and output primitives, the output vertex count and the position input;
+/// does nothing outside of geometry shaders.
+void ARBDecompiler::DeclareGeometry() {
+    if (stage == ShaderType::Geometry) {
+        const auto& graphics = registry.GetGraphicsInfo();
+        const auto& sph = ir.GetHeader();
+        AddLine("PRIMITIVE_IN {};", PrimitiveDescription(graphics.primitive_topology));
+        AddLine("PRIMITIVE_OUT {};", TopologyName(sph.common3.output_topology));
+        AddLine("VERTICES_OUT {};", sph.common4.max_output_vertices.Value());
+        AddLine("ATTRIB vertex_position = vertex.position;");
+    }
+}
+
+/// Declares the eight color outputs, from result_color7 down to result_color0; does nothing
+/// outside of fragment shaders.
+void ARBDecompiler::DeclareFragment() {
+    if (stage == ShaderType::Fragment) {
+        AddLine("OUTPUT result_color7 = result.color[7];");
+        AddLine("OUTPUT result_color6 = result.color[6];");
+        AddLine("OUTPUT result_color5 = result.color[5];");
+        AddLine("OUTPUT result_color4 = result.color[4];");
+        AddLine("OUTPUT result_color3 = result.color[3];");
+        AddLine("OUTPUT result_color2 = result.color[2];");
+        AddLine("OUTPUT result_color1 = result.color[1];");
+        // The first target is declared without an index
+        AddLine("OUTPUT result_color0 = result.color;");
+    }
+}
+
+/// Declares the workgroup size and, when the shader has any, its shared memory; does nothing
+/// outside of compute shaders.
+void ARBDecompiler::DeclareCompute() {
+    if (stage != ShaderType::Compute) {
+        return;
+    }
+    const ComputeInfo& compute = registry.GetComputeInfo();
+    const auto& group = compute.workgroup_size;
+    AddLine("GROUP_SIZE {} {} {};", group[0], group[1], group[2]);
+
+    if (compute.shared_memory_size_in_words > 0) {
+        // The size is stored in words and declared as four times that amount
+        const u32 shared_size = compute.shared_memory_size_in_words * 4;
+        AddLine("SHARED_MEMORY {};", shared_size);
+        AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+    }
+}
+
+/// Declares an "in_attr<index>" array for every generic input attribute used by the shader.
+/// Compute shaders declare nothing. In fragment shaders the declaration is prefixed with the
+/// flags returned by GetInputFlags for the attribute's pixel input mode, and attributes whose
+/// mode is PixelImap::Unused are skipped.
+void ARBDecompiler::DeclareInputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    const std::string_view stage_name = StageInputName(stage);
+    for (const auto attribute : ir.GetInputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+
+        std::string_view suffix;
+        if (stage == ShaderType::Fragment) {
+            const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+            if (input_mode == PixelImap::Unused) {
+                // Skip only this attribute; returning here would also drop the declarations
+                // of every attribute that follows it.
+                continue;
+            }
+            suffix = GetInputFlags(input_mode);
+        }
+        AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+                index);
+    }
+}
+
+/// Declares an "out_attr<index>" array for every generic output attribute written by the
+/// shader; compute shaders declare nothing.
+void ARBDecompiler::DeclareOutputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(attribute)) {
+            const u32 location = GetGenericAttributeIndex(attribute);
+            AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", location, location,
+                    location);
+        }
+    }
+}
+
+/// Declares the "lmem" array when the shader has local memory. Compute shaders take the size
+/// from the compute info (stored in words), other stages from the shader header.
+void ARBDecompiler::DeclareLocalMemory() {
+    u64 size_in_bytes = 0;
+    if (stage != ShaderType::Compute) {
+        size_in_bytes = ir.GetHeader().GetLocalMemorySize();
+    } else {
+        size_in_bytes = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+    }
+    if (size_in_bytes != 0) {
+        // One element per four units of size, rounded up
+        const u64 num_elements = Common::AlignUp(size_in_bytes, 4) / 4;
+        AddLine("TEMP lmem[{}];", num_elements);
+    }
+}
+
+/// Declares one STORAGE array per global memory region used by the shader. Bindings are
+/// numbered sequentially from zero rather than from the device's base shader storage binding.
+void ARBDecompiler::DeclareGlobalMemory() {
+    u32 next_binding = 0;
+    for (const auto& entry : ir.GetGlobalMemory()) {
+        AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(entry.first),
+                next_binding);
+        ++next_binding;
+    }
+}
+
+/// Declares one CBUFFER array per constant buffer used by the shader, named after the buffer's
+/// key and bound sequentially from zero.
+void ARBDecompiler::DeclareConstantBuffers() {
+    u32 next_binding = 0;
+    for (const auto& entry : ir.GetConstantBuffers()) {
+        AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", entry.first, next_binding);
+        ++next_binding;
+    }
+}
+
+/// Declares a TEMP named "R<index>" for every register used by the shader.
+void ARBDecompiler::DeclareRegisters() {
+    const auto& used_registers = ir.GetRegisters();
+    for (const u32 index : used_registers) {
+        AddLine("TEMP R{};", index);
+    }
+}
+
+/// Declares the temporaries "T0" up to "T<max_temporaries - 1>".
+void ARBDecompiler::DeclareTemporaries() {
+    std::size_t index = 0;
+    while (index < max_temporaries) {
+        AddLine("TEMP T{};", index);
+        ++index;
+    }
+}
+
+/// Declares a TEMP named "P<index>" for every predicate used by the shader.
+void ARBDecompiler::DeclarePredicates() {
+    const auto& used_predicates = ir.GetPredicates();
+    for (const Tegra::Shader::Pred predicate : used_predicates) {
+        const u64 index = static_cast<u64>(predicate);
+        AddLine("TEMP P{};", index);
+    }
+}
+
+/// Declares a TEMP for every name listed in INTERNAL_FLAG_NAMES.
+void ARBDecompiler::DeclareInternalFlags() {
+    for (const char* flag_name : INTERNAL_FLAG_NAMES) {
+        AddLine("TEMP {};", flag_name);
+    }
+}
+
+/// Emits the instructions that give the program's variables their initial values: the FSWZA
+/// and FSWZB tables, the position output (vertex and geometry stages only), every generic
+/// output attribute, every used register and every used predicate.
+void ARBDecompiler::InitializeVariables() {
+    AddLine("MOV.F32 FSWZA[0], -1;");
+    AddLine("MOV.F32 FSWZA[1], 1;");
+    AddLine("MOV.F32 FSWZA[2], -1;");
+    AddLine("MOV.F32 FSWZA[3], 0;");
+    AddLine("MOV.F32 FSWZB[0], -1;");
+    AddLine("MOV.F32 FSWZB[1], -1;");
+    AddLine("MOV.F32 FSWZB[2], 1;");
+    AddLine("MOV.F32 FSWZB[3], -1;");
+
+    const bool writes_position = stage == ShaderType::Vertex || stage == ShaderType::Geometry;
+    if (writes_position) {
+        AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+    }
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(attribute)) {
+            const u32 location = GetGenericAttributeIndex(attribute);
+            AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", location);
+        }
+    }
+    for (const u32 register_index : ir.GetRegisters()) {
+        AddLine("MOV.F R{}, {{0, 0, 0, 0}};", register_index);
+    }
+    for (const Tegra::Shader::Pred predicate : ir.GetPredicates()) {
+        const u64 predicate_index = static_cast<u64>(predicate);
+        AddLine("MOV.U P{}, {{0, 0, 0, 0}};", predicate_index);
+    }
+}
+
+/// Decompiles the structured (AST) form of the shader: declares and zeroes the "F<n>" flow
+/// variables, initializes the remaining variables and then visits the AST program.
+void ARBDecompiler::DecompileAST() {
+    const u32 flow_variable_count = ir.GetASTNumVariables();
+
+    // All declarations are emitted before the first initialization
+    u32 variable = 0;
+    while (variable < flow_variable_count) {
+        AddLine("TEMP F{};", variable);
+        ++variable;
+    }
+    variable = 0;
+    while (variable < flow_variable_count) {
+        AddLine("MOV.U F{}, {{0, 0, 0, 0}};", variable);
+        ++variable;
+    }
+
+    InitializeVariables();
+
+    VisitAST(ir.GetASTProgram());
+}
+
+/// Decompiles the shader as a loop over its basic blocks driven by a program counter.
+/// PC.x holds the address of the block to run. Inside a REP loop every block is emitted as
+/// the IF branch of a chain of IF/ELSE statements that compares PC.x with the block address;
+/// the final ELSE returns from the program.
+void ARBDecompiler::DecompileBranchMode() {
+    static constexpr u32 FLOW_STACK_SIZE = 20;
+    const bool uses_flow_stack = !ir.IsFlowStackDisabled();
+    if (uses_flow_stack) {
+        AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP SSY_TOP;");
+        AddLine("TEMP PBK_TOP;");
+    }
+
+    AddLine("TEMP PC;");
+
+    if (uses_flow_stack) {
+        AddLine("MOV.U SSY_TOP.x, 0;");
+        AddLine("MOV.U PBK_TOP.x, 0;");
+    }
+
+    InitializeVariables();
+
+    const auto& basic_blocks = ir.GetBasicBlocks();
+    const auto basic_block_end = basic_blocks.end();
+    auto basic_block_it = basic_blocks.begin();
+    if (basic_block_it == basic_block_end) {
+        // There is no entry address to start from; dereferencing begin() would be invalid
+        UNREACHABLE_MSG("Shader has no basic blocks");
+        return;
+    }
+    const u32 first_address = basic_block_it->first;
+    AddLine("MOV.U PC.x, {};", first_address);
+
+    AddLine("REP;");
+
+    std::size_t num_blocks = 0;
+    while (basic_block_it != basic_block_end) {
+        const auto& [address, bb] = *basic_block_it;
+        ++num_blocks;
+
+        AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+        AddLine("IF NE.x;");
+
+        VisitBlock(bb);
+
+        ++basic_block_it;
+
+        if (basic_block_it != basic_block_end) {
+            // A block that does not end in a branch falls through to the next address.
+            // An empty block has no last node, so it is treated as falling through.
+            const auto op =
+                bb.empty() ? nullptr : std::get_if<OperationNode>(&*bb[bb.size() - 1]);
+            if (!op || op->GetCode() != OperationCode::Branch) {
+                const u32 next_address = basic_block_it->first;
+                AddLine("MOV.U PC.x, {};", next_address);
+                AddLine("CONT;");
+            }
+        }
+
+        AddLine("ELSE;");
+    }
+    AddLine("RET;");
+    // Close one IF per emitted block
+    while (num_blocks--) {
+        AddLine("ENDIF;");
+    }
+
+    AddLine("ENDREP;");
+}
+
+/// Emits the code for one AST node, recursing into its children where it has any.
+/// Conditions are evaluated with VisitExpression and tested by moving them into RC.x with
+/// MOVC, which lets the following IF or BRK test the condition code.
+void ARBDecompiler::VisitAST(const ASTNode& node) {
+    // Visits every node of a child list in order
+    const auto visit_children = [this](auto& nodes) {
+        for (ASTNode current = nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    };
+
+    if (const auto program = std::get_if<ASTProgram>(&*node->GetInnerData())) {
+        visit_children(program->nodes);
+    } else if (const auto if_then = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(if_then->condition);
+        ResetTemporaries();
+
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        visit_children(if_then->nodes);
+        AddLine("ENDIF;");
+    } else if (const auto if_else = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
+        // NOTE(review): ASTIfThen closes its own block with ENDIF and nothing is closed
+        // here, so this ELSE does not pair with an open IF - confirm whether this node kind
+        // is ever produced.
+        AddLine("ELSE;");
+        visit_children(if_else->nodes);
+    } else if (const auto block = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
+        VisitBlock(block->nodes);
+    } else if (const auto var_set = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
+        AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition));
+        ResetTemporaries();
+    } else if (const auto do_while = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
+        AddLine("REP;");
+        visit_children(do_while->nodes);
+        // Evaluate the condition at the end of every iteration. Evaluating it before REP
+        // would compute it only once, into temporaries the loop body is free to reuse.
+        // NOTE(review): this leaves the loop when the condition is non-zero - confirm that
+        // matches the meaning of ASTDoWhile's condition.
+        AddLine("MOVC.U RC.x, {};", VisitExpression(do_while->condition));
+        AddLine("BRK (NE.x);");
+        ResetTemporaries();
+        AddLine("ENDREP;");
+    } else if (const auto ast_return = std::get_if<ASTReturn>(&*node->GetInnerData())) {
+        // An always-true condition needs no surrounding IF
+        const bool is_true = ExprIsTrue(ast_return->condition);
+        if (!is_true) {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast_return->condition));
+            AddLine("IF NE.x;");
+            ResetTemporaries();
+        }
+        if (ast_return->kills) {
+            AddLine("KIL TR;");
+        } else {
+            Exit();
+        }
+        if (!is_true) {
+            AddLine("ENDIF;");
+        }
+    } else if (const auto ast_break = std::get_if<ASTBreak>(&*node->GetInnerData())) {
+        if (ExprIsTrue(ast_break->condition)) {
+            AddLine("BRK;");
+        } else {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast_break->condition));
+            AddLine("BRK (NE.x);");
+            ResetTemporaries();
+        }
+    } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
+        // Nothing to do
+    } else {
+        UNREACHABLE();
+    }
+}
+
+/// Evaluates a control flow expression and returns the operand holding its value.
+/// Predicates, flow variables and boolean constants are returned directly without emitting
+/// code; the remaining expressions emit instructions, and the ones handled inline here write
+/// into a newly allocated temporary. Unknown expression kinds are reported as unreachable and
+/// yield "0".
+std::string ARBDecompiler::VisitExpression(const Expr& node) {
+    // Leaf expressions that map straight to an operand
+    if (const auto predicate = std::get_if<ExprPredicate>(&*node)) {
+        return fmt::format("P{}.x", static_cast<u64>(predicate->predicate));
+    }
+    if (const auto variable = std::get_if<ExprVar>(&*node)) {
+        return fmt::format("F{}.x", variable->var_index);
+    }
+    if (const auto constant = std::get_if<ExprBoolean>(&*node)) {
+        return constant->value ? "0xffffffff" : "0";
+    }
+    if (const auto cond_code = std::get_if<ExprCondCode>(&*node)) {
+        return Visit(ir.GetConditionCode(cond_code->cc));
+    }
+
+    // Expressions that need an instruction; the destination is allocated before the operands
+    if (const auto conjunction = std::get_if<ExprAnd>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("AND.U {}, {}, {};", dest, VisitExpression(conjunction->operand1),
+                VisitExpression(conjunction->operand2));
+        return dest;
+    }
+    if (const auto disjunction = std::get_if<ExprOr>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("OR.U {}, {}, {};", dest, VisitExpression(disjunction->operand1),
+                VisitExpression(disjunction->operand2));
+        return dest;
+    }
+    if (const auto negation = std::get_if<ExprNot>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("CMP.S {}, {}, 0, -1;", dest, VisitExpression(negation->operand1));
+        return dest;
+    }
+    if (const auto gpr_equal = std::get_if<ExprGprEqual>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("SEQ.U {}, R{}.x, {};", dest, gpr_equal->gpr, gpr_equal->value);
+        return dest;
+    }
+
+    UNREACHABLE();
+    return "0";
+}
+
+/// Visits every node of a block in order, discarding the operands they return.
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+    for (const auto& statement : bb) {
+        Visit(statement);
+    }
+}
+
+/// Emits the GLASM needed to evaluate a node and returns the operand text naming its value
+/// (a register, a temporary or an inline constant). Statement-like nodes such as conditionals
+/// and comments emit their code, if any, and return an empty string.
+/// Loaded values are always written to a freshly allocated temporary rather than to the operand
+/// returned by a nested Visit, which may name a guest register or a constant.
+std::string ARBDecompiler::Visit(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
+        if (const auto amend_index = operation->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        const std::size_t index = static_cast<std::size_t>(operation->GetCode());
+        if (index >= OPERATION_DECOMPILERS.size()) {
+            UNREACHABLE_MSG("Out of bounds operation: {}", index);
+            return {};
+        }
+        const auto decompiler = OPERATION_DECOMPILERS[index];
+        if (decompiler == nullptr) {
+            UNREACHABLE_MSG("Undefined operation: {}", index);
+            return {};
+        }
+        return (this->*decompiler)(*operation);
+    }
+
+    if (const auto gpr = std::get_if<GprNode>(&*node)) {
+        const u32 index = gpr->GetIndex();
+        if (index == Register::ZeroIndex) {
+            return "{0, 0, 0, 0}.x";
+        }
+        return fmt::format("R{}.x", index);
+    }
+
+    if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
+        return fmt::format("CV{}.x", cv->GetIndex());
+    }
+
+    if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
+        return temporary;
+    }
+
+    if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        switch (const auto index = predicate->GetIndex(); index) {
+        case Tegra::Shader::Pred::UnusedIndex:
+            AddLine("MOV.S {}, -1;", temporary);
+            break;
+        case Tegra::Shader::Pred::NeverExecute:
+            AddLine("MOV.S {}, 0;", temporary);
+            break;
+        default:
+            AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
+            break;
+        }
+        if (predicate->IsNegated()) {
+            AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
+        }
+        return temporary;
+    }
+
+    if (const auto abuf = std::get_if<AbufNode>(&*node)) {
+        if (abuf->IsPhysicalBuffer()) {
+            UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+            return "{0, 0, 0, 0}.x";
+        }
+
+        const Attribute::Index index = abuf->GetIndex();
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (index) {
+        case Attribute::Index::Position: {
+            if (stage == ShaderType::Geometry) {
+                return fmt::format("{}_position[{}].{}", StageInputName(stage),
+                                   Visit(abuf->GetBuffer()), swizzle);
+            } else {
+                return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
+            }
+        }
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            ASSERT(stage == ShaderType::Vertex);
+            switch (element) {
+            case 2:
+                return "vertex.instance";
+            case 3:
+                return "vertex.id";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            break;
+        case Attribute::Index::PointCoord:
+            switch (element) {
+            case 0:
+                return "fragment.pointcoord.x";
+            case 1:
+                return "fragment.pointcoord.y";
+            }
+            UNIMPLEMENTED();
+            break;
+        case Attribute::Index::FrontFacing: {
+            ASSERT(stage == ShaderType::Fragment);
+            ASSERT(element == 3);
+            const std::string temporary = AllocVectorTemporary();
+            AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+            AddLine("MOV.U.CC RC.x, -RC;");
+            AddLine("MOV.S {}.x, 0;", temporary);
+            AddLine("MOV.S {}.x (NE.x), -1;", temporary);
+            return fmt::format("{}.x", temporary);
+        }
+        default:
+            if (IsGenericAttribute(index)) {
+                if (stage == ShaderType::Geometry) {
+                    return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
+                                       Visit(abuf->GetBuffer()), swizzle);
+                } else {
+                    return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+                                       GetGenericAttributeIndex(index), swizzle);
+                }
+            }
+            UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
+            break;
+        }
+        return "{0, 0, 0, 0}.x";
+    }
+
+    if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+        std::string offset_string;
+        const auto& offset = cbuf->GetOffset();
+        if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
+            offset_string = std::to_string(imm->GetValue());
+        } else {
+            offset_string = Visit(offset);
+        }
+        std::string temporary = AllocTemporary();
+        AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
+        return temporary;
+    }
+
+    if (const auto gmem = std::get_if<GmemNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        return temporary;
+    }
+
+    if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+        // Use a scratch temporary: the address operand may be a guest register.
+        std::string temporary = AllocTemporary();
+        AddLine("SHR.U {}, {}, 2;", temporary, Visit(lmem->GetAddress()));
+        AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
+        return temporary;
+    }
+
+    if (const auto smem = std::get_if<SmemNode>(&*node)) {
+        // Load into a scratch temporary instead of overwriting the address operand.
+        std::string temporary = AllocTemporary();
+        AddLine("LDS.U32 {}, shared_mem[{}];", temporary, Visit(smem->GetAddress()));
+        return temporary;
+    }
+
+    if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
+        const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+        return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+    }
+
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+        if (const auto amend_index = conditional->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
+        AddLine("IF NE.x;");
+        VisitBlock(conditional->GetCode());
+        AddLine("ENDIF;");
+        return {};
+    }
+
+    if (std::holds_alternative<CommentNode>(*node)) {
+        // GLASM lacks comments, so emitting the node's text would generate invalid code.
+        // Comment nodes are therefore dropped on purpose.
+        return {};
+    }
+
+    UNIMPLEMENTED();
+    return {};
+}
+
+std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+    UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
+                     meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
+    // Packs the coordinates, then the array layer, then the depth reference into one vector.
+    const std::size_t num_coords = operation.GetOperandsCount();
+    std::string coords = AllocVectorTemporary();
+    std::size_t used = 0;
+    for (; used < num_coords; ++used) {
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(used), Visit(operation[used]));
+    }
+    if (meta.sampler.is_array) {
+        AddLine("I2F.S {}.{}, {};", coords, Swizzle(used++), Visit(meta.array));
+    }
+    if (meta.sampler.is_shadow) {
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(used++), Visit(meta.depth_compare));
+    }
+    return {std::move(coords), used}; // vector temporary and number of components written
+}
+
+// Builds the ", offset(...)" suffix of a texture instruction; empty when there is no AOFFI.
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    if (meta.aoffi.empty()) {
+        return {};
+    }
+    const std::string offsets = AllocVectorTemporary();
+    for (std::size_t i = 0; i < meta.aoffi.size(); ++i) {
+        AddLine("MOV.S {}.{}, {};", offsets, Swizzle(i), Visit(meta.aoffi[i]));
+    }
+    return fmt::format(", offset({})", offsets);
+}
+
+void ARBDecompiler::Exit() {
+    if (stage != ShaderType::Fragment) {
+        AddLine("RET;");
+        return;
+    }
+    // Fragment stages copy their outputs out of the guest registers before returning.
+    const auto read_register = [this](u32 reg) -> std::string {
+        // TODO(Rodrigo): Replace with contains once C++20 releases
+        const auto& known_registers = ir.GetRegisters();
+        if (known_registers.find(reg) == known_registers.end()) {
+            return "{0, 0, 0, 0}.x"; // registers the IR does not list are read as zero
+        }
+        return fmt::format("R{}.x", reg);
+    };
+
+    const auto& header = ir.GetHeader();
+    u32 next_reg = 0;
+    for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
+        for (u32 component = 0; component < 4; ++component) {
+            if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
+                // Each enabled component consumes the next register, starting at R0.
+                AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
+                        read_register(next_reg));
+                ++next_reg;
+            }
+        }
+    }
+    if (header.ps.omap.depth) { // next_reg is one past the last colour register here
+        AddLine("MOV.F result.depth.z, {};", read_register(next_reg + 1));
+    }
+
+    AddLine("RET;");
+}
+
+std::string ARBDecompiler::Assign(Operation operation) { // operation[0] = operation[1]
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+    // Resolve the destination operand first; shared and global stores are emitted directly.
+    std::string dest_name;
+    if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+        if (gpr->GetIndex() == Register::ZeroIndex) {
+            // Writing to Register::ZeroIndex is a no op
+            return {};
+        }
+        dest_name = fmt::format("R{}.x", gpr->GetIndex());
+    } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (const Attribute::Index index = abuf->GetIndex()) {
+        case Attribute::Index::Position:
+            dest_name = fmt::format("result.position.{}", swizzle);
+            break;
+        case Attribute::Index::LayerViewportPointSize:
+            switch (element) {
+            case 0:
+                UNIMPLEMENTED();
+                return {};
+            case 1:
+            case 2:
+                if (!device.HasNvViewportArray2()) {
+                    LOG_ERROR(
+                        Render_OpenGL,
+                        "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+                    return {};
+                }
+                dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+                break;
+            case 3:
+                dest_name = "result.pointsize.x";
+                break;
+            }
+            break;
+        case Attribute::Index::ClipDistances0123:
+            dest_name = fmt::format("result.clip[{}].x", element);
+            break;
+        case Attribute::Index::ClipDistances4567:
+            dest_name = fmt::format("result.clip[{}].x", element + 4);
+            break;
+        default:
+            if (!IsGenericAttribute(index)) {
+                UNREACHABLE();
+                return {};
+            }
+            dest_name =
+                fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+            break;
+        }
+    } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+        const std::string address = AllocTemporary(); // scratch: never shift the operand in place
+        AddLine("SHR.U {}, {}, 2;", address, Visit(lmem->GetAddress()));
+        dest_name = fmt::format("lmem[{}].x", address);
+    } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+        AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+        ResetTemporaries();
+        return {};
+    } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+        const std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        ResetTemporaries();
+        return {};
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+    // Register, attribute and local memory destinations are written with a plain move.
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+std::string ARBDecompiler::Select(Operation operation) { // CMP.S over the three operands
+    std::string selected = AllocTemporary();
+    AddLine("CMP.S {}, {}, {}, {};", selected, Visit(operation[0]), Visit(operation[1]),
+            Visit(operation[2]));
+    return selected;
+}
+
+std::string ARBDecompiler::FClamp(Operation operation) {
+    // Bit pattern of 1.0f; replace with std::bit_cast once C++20 is available
+    static constexpr u32 ONE_F32_BITS = 0x3f800000;
+    // Immediate bounds of 0 and 1.0f become one saturated move; anything else is MIN then MAX.
+    std::string clamped = AllocTemporary();
+    const Node& input = operation[0];
+    const Node& lower = operation[1];
+    const Node& upper = operation[2];
+    const auto* const imm_lo = std::get_if<ImmediateNode>(&*lower);
+    const auto* const imm_hi = std::get_if<ImmediateNode>(&*upper);
+    if (!imm_lo || !imm_hi || imm_lo->GetValue() != 0 || imm_hi->GetValue() != ONE_F32_BITS) {
+        AddLine("MIN.F {}, {}, {};", clamped, Visit(input), Visit(upper));
+        AddLine("MAX.F {}, {}, {};", clamped, clamped, Visit(lower));
+    } else {
+        AddLine("MOV.F32.SAT {}, {};", clamped, Visit(input));
+    }
+    return clamped;
+}
+
+std::string ARBDecompiler::FCastHalf0(Operation operation) { // UP2H into .x, returned as-is
+    const std::string unpacked = AllocVectorTemporary();
+    AddLine("UP2H.F {}.x, {};", unpacked, Visit(operation[0]));
+    return fmt::format("{}.x", unpacked);
+}
+
+std::string ARBDecompiler::FCastHalf1(Operation operation) { // UP2H into .y, then moved to .x
+    const std::string unpacked = AllocVectorTemporary();
+    AddLine("UP2H.F {}.y, {};", unpacked, Visit(operation[0]));
+    AddLine("MOV {}.x, {}.y;", unpacked, unpacked);
+    return fmt::format("{}.x", unpacked);
+}
+
+std::string ARBDecompiler::FSqrt(Operation operation) { // emitted as RCP applied to RSQ
+    std::string root = AllocTemporary();
+    AddLine("RSQ.F32 {}, {};", root, Visit(operation[0]));
+    AddLine("RCP.F32 {}, {};", root, root);
+    return root;
+}
+
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+    const std::string scratch = AllocVectorTemporary();
+    if (!device.HasWarpIntrinsics()) { // fallback: plain add of the two operands
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        AddLine("ADD.F {}.x, {}, {};", scratch, Visit(operation[0]), Visit(operation[1]));
+        return fmt::format("{}.x", scratch);
+    }
+    // .z = (operation[2] >> ((threadid & 3) * 2)) & 3, the index into FSWZA and FSWZB.
+    AddLine("AND.U {}.z, {}.threadid, 3;", scratch, StageInputName(stage));
+    AddLine("SHL.U {}.z, {}.z, 1;", scratch, scratch);
+    AddLine("SHR.U {}.z, {}, {}.z;", scratch, Visit(operation[2]), scratch);
+    AddLine("AND.U {}.z, {}.z, 3;", scratch, scratch);
+    AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", scratch, Visit(operation[0]), scratch);
+    AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", scratch, Visit(operation[1]), scratch);
+    AddLine("ADD.F32 {}.x, {}.x, {}.y;", scratch, scratch, scratch);
+    return fmt::format("{}.x", scratch);
+}
+
+std::string ARBDecompiler::HAdd2(Operation operation) { // unpack both, ADD.F16, pack again
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", lhs, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", rhs, Visit(operation[1]));
+    AddLine("ADD.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+std::string ARBDecompiler::HMul2(Operation operation) { // unpack both, MUL.F16, pack again
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", lhs, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", rhs, Visit(operation[1]));
+    AddLine("MUL.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+std::string ARBDecompiler::HFma2(Operation operation) { // unpack all three, MAD.F16, pack again
+    const std::string factor_a = AllocVectorTemporary();
+    const std::string factor_b = AllocVectorTemporary();
+    const std::string addend = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", factor_a, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", factor_b, Visit(operation[1]));
+    AddLine("UP2H.F {}.xy, {};", addend, Visit(operation[2]));
+    AddLine("MAD.F16 {}, {}, {}, {};", factor_a, factor_a, factor_b, addend);
+    AddLine("PK2H.F {}.x, {};", factor_a, factor_a);
+    return fmt::format("{}.x", factor_a);
+}
+
+std::string ARBDecompiler::HAbsolute(Operation operation) { // packs |value| of the unpacked pair
+    const std::string halves = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    AddLine("PK2H.F {}.x, |{}|;", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HNegate(Operation operation) { // operation[1]/[2] gate .x/.y negation
+    const std::string halves = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    AddLine("MOVC.S RC.x, {};", Visit(operation[1]));
+    AddLine("MOV.F {}.x (NE.x), -{}.x;", halves, halves);
+    AddLine("MOVC.S RC.x, {};", Visit(operation[2]));
+    AddLine("MOV.F {}.y (NE.x), -{}.y;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HClamp(Operation operation) { // MAX with [1], then MIN with [2]
+    const std::string halves = AllocVectorTemporary();
+    const std::string bound = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    AddLine("MOV.U {}.x, {};", bound, Visit(operation[1]));
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MAX.F {}, {}, {};", halves, halves, bound);
+    AddLine("MOV.U {}.x, {};", bound, Visit(operation[2]));
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MIN.F {}, {}, {};", halves, halves, bound);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HCastFloat(Operation operation) { // packs (operation[0], zero)
+    const std::string halves = AllocVectorTemporary();
+    AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", halves);
+    AddLine("MOV.F {}.x, {};", halves, Visit(operation[0]));
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HUnpack(Operation operation) { // rearranges halves per the HalfType
+    const std::string packed = Visit(operation[0]);
+    switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+    case Tegra::Shader::HalfType::H0_H1: // already in the expected layout
+        return packed;
+    case Tegra::Shader::HalfType::F32: { // the same 32-bit value is packed into both halves
+        const std::string halves = AllocVectorTemporary();
+        AddLine("MOV.U {}.x, {};", halves, packed);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+        AddLine("PK2H.F {}.x, {};", halves, halves);
+        return fmt::format("{}.x", halves);
+    }
+    case Tegra::Shader::HalfType::H0_H0: { // .x duplicated into .y before packing
+        const std::string halves = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", halves, packed);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+        AddLine("PK2H.F {}.x, {};", halves, halves);
+        return fmt::format("{}.x", halves);
+    }
+    case Tegra::Shader::HalfType::H1_H1: { // .y duplicated into .x before packing
+        const std::string halves = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", halves, packed);
+        AddLine("MOV.U {}.x, {}.y;", halves, halves);
+        AddLine("PK2H.F {}.x, {};", halves, halves);
+        return fmt::format("{}.x", halves);
+    }
+    }
+    UNREACHABLE();
+    return "{0, 0, 0, 0}.x";
+}
+
+std::string ARBDecompiler::HMergeF32(Operation operation) { // returns .x of the unpacked pair
+    const std::string halves = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HMergeH0(Operation operation) { // .x taken from operation[1]
+    const std::string halves = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", halves, Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.z;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HMergeH1(Operation operation) { // .y taken from operation[1]
+    const std::string halves = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", halves, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", halves, Visit(operation[1]));
+    AddLine("MOV.U {}.y, {}.w;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::HPack2(Operation operation) { // packs (operation[0], operation[1])
+    const std::string halves = AllocVectorTemporary();
+    AddLine("MOV.U {}.x, {};", halves, Visit(operation[0]));
+    AddLine("MOV.U {}.y, {};", halves, Visit(operation[1]));
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+    // The destination is either a predicate or an internal flag.
+    std::string dest_name;
+
+    if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
+        ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+        const Tegra::Shader::Pred index = pred->GetIndex();
+        const bool is_fixed = index == Tegra::Shader::Pred::NeverExecute ||
+                              index == Tegra::Shader::Pred::UnusedIndex;
+        if (is_fixed) {
+            // Writing to these predicates is a no-op
+            return {};
+        }
+        dest_name = fmt::format("P{}.x", static_cast<u64>(index));
+    } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) {
+        const std::size_t flag = static_cast<std::size_t>(internal_flag->GetFlag());
+        dest_name = fmt::format("{}.x", INTERNAL_FLAG_NAMES[flag]);
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+std::string ARBDecompiler::LogicalPick2(Operation operation) { // picks one component of [0]
+    std::string picked = AllocTemporary();
+    const u32 component = std::get<ImmediateNode>(*operation[1]).GetValue();
+    AddLine("MOV.U {}, {}.{};", picked, Visit(operation[0]), Swizzle(component));
+    return picked;
+}
+
+std::string ARBDecompiler::LogicalAnd2(Operation operation) { // ANDs .x and .y of operation[0]
+    std::string conjunction = AllocTemporary();
+    const std::string pair = Visit(operation[0]);
+    AddLine("AND.U {}, {}.x, {}.y;", conjunction, pair, pair);
+    return conjunction;
+}
+
+std::string ARBDecompiler::FloatOrdered(Operation operation) { // -1 unless an operand is NaN
+    std::string ordered = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, -1;", ordered);
+    AddLine("MOV.S {} (NAN.x), 0;", ordered);
+    AddLine("MOV.S {} (NAN.y), 0;", ordered);
+    return ordered;
+}
+
+std::string ARBDecompiler::FloatUnordered(Operation operation) { // -1 when an operand is NaN
+    std::string unordered = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", unordered);
+    AddLine("MOV.S {} (NAN.x), -1;", unordered);
+    AddLine("MOV.S {} (NAN.y), -1;", unordered);
+    return unordered;
+}
+
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) { // -1 if ADDC.U sets CF.x
+    std::string carry = AllocTemporary();
+    AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", carry);
+    AddLine("IF CF.x;");
+    AddLine("MOV.S {}, -1;", carry);
+    AddLine("ENDIF;");
+    return carry;
+}
+
+std::string ARBDecompiler::Texture(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coords, used] = BuildCoords(operation);
+    // Bias and LOD go into coords.w when fewer than four components are used, otherwise into
+    std::string_view opcode = "TEX"; // an extra operand placed after the coordinates.
+    std::string extra;
+    if (meta.bias) {
+        ASSERT(!meta.lod);
+        opcode = "TXB";
+
+        if (used < 4) {
+            AddLine("MOV.F {}.w, {};", coords, Visit(meta.bias));
+        } else {
+            const std::string bias_operand = AllocTemporary();
+            AddLine("MOV.F {}, {};", bias_operand, Visit(meta.bias));
+            extra = fmt::format(" {},", bias_operand);
+        }
+    }
+    if (meta.lod) {
+        ASSERT(!meta.bias);
+        opcode = "TXL";
+
+        if (used < 4) {
+            AddLine("MOV.F {}.w, {};", coords, Visit(meta.lod));
+        } else {
+            const std::string lod_operand = AllocTemporary();
+            AddLine("MOV.F {}, {};", lod_operand, Visit(meta.lod));
+            extra = fmt::format(" {},", lod_operand);
+        }
+    }
+
+    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, coords, coords, extra, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", coords, coords, Swizzle(meta.element));
+    return fmt::format("{}.x", coords);
+}
+
+std::string ARBDecompiler::TextureGather(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coords, used] = BuildCoords(operation);
+    // Non-shadow gathers select the gathered component with a swizzle on the texture operand.
+    std::string component_suffix;
+    if (!meta.sampler.is_shadow) {
+        const auto& component = std::get<ImmediateNode>(*meta.component);
+        component_suffix = fmt::format(".{}", Swizzle(component.GetValue()));
+    }
+
+    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", coords, coords, sampler_id, component_suffix,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", coords, coords, Swizzle(meta.element));
+    return fmt::format("{}.x", coords);
+}
+
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string dimensions = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+    // The queried level is operation[0] when present, otherwise the literal 0.
+    const std::string level = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0";
+    AddLine("TXQ {}, {}, texture[{}], {};", dimensions, level, sampler_id, TextureType(meta));
+    AddLine("MOV.U {}.x, {}.{};", dimensions, dimensions, Swizzle(meta.element));
+    return fmt::format("{}.x", dimensions);
+}
+
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string scratch = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+    // Coordinates are loaded into the temporary that later receives the LOD result.
+    const std::size_t num_coords = operation.GetOperandsCount();
+    for (std::size_t coord = 0; coord < num_coords; ++coord) {
+        AddLine("MOV.F {}.{}, {};", scratch, Swizzle(coord), Visit(operation[coord]));
+    }
+    AddLine("LOD.F {}, {}, texture[{}], {};", scratch, scratch, sampler_id, TextureType(meta));
+    AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", scratch, scratch);
+    AddLine("TRUNC.S {}, {};", scratch, scratch);
+    AddLine("MOV.U {}.x, {}.{};", scratch, scratch, Swizzle(meta.element));
+    return fmt::format("{}.x", scratch);
+}
+
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coords, used] = BuildCoords(operation);
+    // Every sampler kind except buffers takes the LOD in coords.w.
+    if (!meta.sampler.is_buffer) {
+        ASSERT(used < 4);
+        AddLine("MOV.F {}.w, {};", coords, Visit(meta.lod));
+    }
+    AddLine("TXF.F {}, {}, texture[{}], {}{};", coords, coords, sampler_id, TextureType(meta),
+            BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", coords, coords, Swizzle(meta.element));
+    return fmt::format("{}.x", coords);
+}
+
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+    const MetaTexture& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string ddx = AllocVectorTemporary();
+    const std::string ddy = AllocVectorTemporary();
+    const std::string coord = BuildCoords(operation).first;
+    // meta.derivates alternates between the ddx and ddy value of each component.
+    const std::size_t num_pairs = meta.derivates.size() / 2;
+    for (std::size_t idx = 0; idx < num_pairs; ++idx) {
+        const char component = Swizzle(idx);
+        AddLine("MOV.F {}.{}, {};", ddx, component, Visit(meta.derivates[idx * 2]));
+        AddLine("MOV.F {}.{}, {};", ddy, component, Visit(meta.derivates[idx * 2 + 1]));
+    }
+    // The coordinate temporary is reused to receive the sampled texel.
+    const std::string_view texel = coord;
+    AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", texel, coord, ddx, ddy, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.F {}.x, {}.{};", texel, texel, Swizzle(meta.element));
+    return fmt::format("{}.x", texel);
+}
+
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+    const MetaImage& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t num_coords = operation.GetOperandsCount();
+    const std::string_view image_type = ImageType(meta.image.type);
+    // The coordinate temporary doubles as the destination of the load.
+    const std::string texel = AllocVectorTemporary();
+    for (std::size_t coord = 0; coord < num_coords; ++coord) {
+        AddLine("MOV.S {}.{}, {};", texel, Swizzle(coord), Visit(operation[coord]));
+    }
+    AddLine("LOADIM.F {}, {}, image[{}], {};", texel, texel, image_id, image_type);
+    AddLine("MOV.F {}.x, {}.{};", texel, texel, Swizzle(meta.element));
+    return fmt::format("{}.x", texel);
+}
+
+std::string ARBDecompiler::ImageStore(Operation operation) {
+    const MetaImage& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t coord_count = operation.GetOperandsCount();
+    const std::size_t value_count = meta.values.size();
+    const std::string_view image_type = ImageType(meta.image.type);
+    // Operands are the coordinates; the texel components to write come from meta.values.
+    const std::string coords = AllocVectorTemporary();
+    const std::string texel = AllocVectorTemporary();
+    for (std::size_t c = 0; c < coord_count; ++c) {
+        AddLine("MOV.S {}.{}, {};", coords, Swizzle(c), Visit(operation[c]));
+    }
+    for (std::size_t v = 0; v < value_count; ++v) {
+        AddLine("MOV.F {}.{}, {};", texel, Swizzle(v), Visit(meta.values[v]));
+    }
+    AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, texel, coords, image_type);
+    return {};
+}
+
+std::string ARBDecompiler::Branch(Operation operation) { // PC.x = immediate target, then CONT
+    const u32 address = std::get<ImmediateNode>(*operation[0]).GetValue();
+    AddLine("MOV.U PC.x, {};", address);
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::BranchIndirect(Operation operation) { // PC.x = computed target
+    AddLine("MOV.U PC.x, {};", Visit(operation[0]));
+    AddLine("CONT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::PushFlowStack(Operation operation) { // store target, then bump TOP
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const u32 address = std::get<ImmediateNode>(*operation[0]).GetValue();
+    const std::string_view name = StackName(stack_class);
+    AddLine("MOV.U {}[{}_TOP.x].x, {};", name, name, address);
+    AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    return {};
+}
+
+std::string ARBDecompiler::PopFlowStack(Operation operation) { // drop TOP, load PC.x, CONT
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const std::string_view name = StackName(stack_class);
+    AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", name, name);
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::Exit(Operation) { // operation-table wrapper around Exit()
+    Exit();
+    return std::string{};
+}
+
+std::string ARBDecompiler::Discard(Operation) { // emits an unconditional KIL
+    AddLine("KIL TR;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::EmitVertex(Operation) { // emits EMIT; produces no value
+    AddLine("EMIT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::EndPrimitive(Operation) { // emits ENDPRIM; produces no value
+    AddLine("ENDPRIM;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::InvocationId(Operation) { // fixed binding name, emits no code
+    return std::string{"primitive.invocation"};
+}
+
+std::string ARBDecompiler::YNegate(Operation) { // stub: always yields the constant 1
+    LOG_WARNING(Render_OpenGL, "(STUBBED)");
+    const std::string constant = AllocTemporary();
+    AddLine("MOV.F {}, 1;", constant);
+    return constant;
+}
+
+std::string ARBDecompiler::ThreadId(Operation) { // "<stage input name>.threadid"
+    return std::string{StageInputName(stage)} + ".threadid";
+}
+
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+    if (!device.HasWarpIntrinsics()) { // fallback: the value is returned unshuffled
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        return Visit(operation[0]);
+    }
+    const std::string shuffled = AllocVectorTemporary();
+    AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", shuffled, Visit(operation[0]),
+            Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.y;", shuffled, shuffled); // the .y component is what gets returned
+    return fmt::format("{}.x", shuffled);
+}
+
+std::string ARBDecompiler::Barrier(Operation) { // BAR is only emitted for decompiled shaders
+    if (ir.IsDecompiled()) {
+        AddLine("BAR;");
+    } else {
+        LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
+    }
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) { // emits MEMBAR.CTA
+    AddLine("MEMBAR.CTA;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { // emits a plain MEMBAR
+    AddLine("MEMBAR;");
+    return std::string{};
+}
+
+} // Anonymous namespace
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier) {
+    return ARBDecompiler{device, ir, registry, stage, identifier}.Code(); // one-shot decompiler
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 9964ea894..d9f7b4cc6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,22 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) { // Otherwise gpu_address is left untouched
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); // Done before the query
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
+
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                         data); // Writes size bytes from data at offset in this GL buffer
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); // Barrier issued before the readback below
+    glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                            data); // Reads size bytes at offset from this GL buffer into data
+}
+
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const { // This buffer is the copy destination
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,44 +72,21 @@ OGLBufferCache::~OGLBufferCache() {
     glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }
 
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
-}
-
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, cpu_addr, size); // Forwards the stored device
 }
 
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0}; // All-zero BufferInfo; the size argument is ignored
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++]; // Take the next cbuf and advance the cursor
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0}; // The remaining BufferInfo fields are zero
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a9e86cfc7..59d95adbc 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
@@ -24,57 +23,57 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class CachedBufferBlock;
+class Buffer : public VideoCommon::BufferBlock { // Buffer block backed by an OGLBuffer
+public:
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+    ~Buffer();
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const; // Host to GL
 
-class CachedBufferBlock : public VideoCommon::BufferBlock {
-public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    void Download(std::size_t offset, std::size_t size, u8* data) const; // GL to host
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const; // Copies from src into this buffer
 
-    GLuint GetHandle() const {
+    GLuint Handle() const noexcept { // Handle of the underlying OGLBuffer
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept { // Value of gpu_address; 0 if never queried
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    u64 gpu_address = 0; // Only filled in when the device has vertex buffer unified memory
 };
 
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override; // Returns an all-zero BufferInfo
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
     }
 
 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device; // Passed to every Buffer created by CreateBlock
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{}; // GL buffer names handed out by ConstBufferUpload
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index a14641b97..208fc6167 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -123,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
     u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
     u32 base_images = 0;
 
-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
     bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // This is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
 
     // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
         const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
         bindings[stage].image =
             Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
     }
@@ -170,7 +178,7 @@ bool IsASTCSupported() {
         for (const GLenum format : formats) {
             for (const GLenum support : required_support) {
                 GLint value;
-                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+                glGetInternalformativ(target, format, support, 1, &value);
                 if (value != GL_FULL_SUPPORT) {
                     return false;
                 }
@@ -185,6 +193,7 @@ bool IsASTCSupported() {
 Device::Device()
     : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
@@ -208,13 +217,21 @@ Device::Device()
     has_shader_ballot = GLAD_GL_ARB_shader_ballot;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
     has_astc = IsASTCSupported();
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
     has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
-                           GLAD_GL_NV_compute_program5;
+                           GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
+                           GLAD_GL_NV_transform_feedback2;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
@@ -235,6 +252,7 @@ Device::Device(std::nullptr_t) {
     has_shader_ballot = true;
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
     has_variable_aoffi = true;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 98cca0254..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -68,6 +68,14 @@ public:
         return has_image_load_formatted;
     }
 
+    bool HasTextureShadowLod() const { // Reflects GL_EXT_texture_shadow_lod presence
+        return has_texture_shadow_lod;
+    }
+
+    bool HasVertexBufferUnifiedMemory() const { // Mirrors GLAD_GL_NV_vertex_buffer_unified_memory
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -88,6 +96,10 @@ public:
         return has_fast_buffer_sub_data;
     }
 
+    bool HasNvViewportArray2() const { // Mirrors GLAD_GL_NV_viewport_array2
+        return has_nv_viewport_array2;
+    }
+
     bool UseAssemblyShaders() const {
         return use_assembly_shaders;
     }
@@ -106,11 +118,14 @@ private:
     bool has_shader_ballot{};
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
     bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
     bool use_assembly_shaders{};
 };
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 55e79aaf6..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -60,15 +61,28 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
 constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
     NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                ShaderType shader_type, std::size_t index = 0) {
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
-        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
-        return engine.GetTextureInfo(tex_handle);
+        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+        return engine.GetTextureInfo(handle);
     }
+
     const auto& gpu_profile = engine.AccessGuestDriverProfile();
     const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
     if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -93,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
     return buffer.size;
 }
 
+/// Translates hardware transform feedback indices
+/// @param location Hardware location
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const u8 index = location / 4; // presumably 4 components per attribute index (confirm)
+    if (index >= 8 && index <= 39) { // 32 generic attributes, rebased to start at 0
+        return {GL_GENERIC_ATTRIB_NV, index - 8};
+    }
+    if (index >= 48 && index <= 55) { // 8 texture coordinates, rebased to start at 0
+        return {GL_TEXTURE_COORD_NV, index - 48};
+    }
+    switch (index) { // Remaining handled indices map to tokens with a fixed index of 0
+    case 7:
+        return {GL_POSITION, 0};
+    case 40:
+        return {GL_PRIMARY_COLOR_NV, 0};
+    case 41:
+        return {GL_SECONDARY_COLOR_NV, 0};
+    case 42:
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    case 43:
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); // Any index not handled above
+    return {GL_POSITION, 0}; // Fallback value after reporting the unhandled index
+}
+
 void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }
@@ -152,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -171,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
         if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
             attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
             glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
-                                  MaxwellToGL::VertexType(attrib), attrib.offset);
+                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);
         } else {
-            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+            glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+                                 MaxwellToGL::VertexFormat(attrib),
                                  attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
         }
         glVertexAttribBinding(gl_index, attrib.buffer);
@@ -190,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -205,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -227,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
@@ -244,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -282,7 +336,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
             continue;
         }
 
-        Shader shader{shader_cache.GetStageProgram(program)};
+        Shader* const shader = shader_cache.GetStageProgram(program);
 
         if (device.UseAssemblyShaders()) {
             // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
@@ -576,7 +630,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                    (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
 
     // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
+    const bool invalidated = buffer_cache.Map(buffer_size);
+
+    if (invalidated) {
+        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+        auto& dirty = gpu.dirty.flags;
+        dirty[Dirty::VertexBuffers] = true;
+        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+            dirty[index] = true;
+        }
+    }
 
     // Prepare vertex array format.
     SetupVertexFormat();
@@ -593,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -842,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
     static constexpr std::array PARAMETER_LUT = {
         GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
         GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
@@ -872,7 +935,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     }
 }
 
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
     const auto& entries = kernel->GetEntries();
@@ -906,8 +969,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -920,28 +982,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
 
     const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
     const GPUVAddr gpu_addr = buffer.address;
-    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
 
     if (device.UseAssemblyShaders()) {
         UNIMPLEMENTED_IF(use_unified);
-        if (offset != 0) {
+        if (info.offset != 0) {
             const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-            cbuf = staging_cbuf;
-            offset = 0;
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
         }
-        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
 
     if (use_unified) {
-        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
     } else {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
 }
 
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
@@ -956,7 +1019,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     }
 }
 
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -973,13 +1036,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, // As SSBO
                       static_cast<GLsizeiptr>(size));
 }
 
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -992,7 +1054,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     }
 }
 
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
@@ -1021,7 +1083,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
     }
 }
 
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).image;
     for (const auto& entry : shader->GetEntries().images) {
@@ -1031,7 +1093,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
     }
 }
 
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
     for (const auto& entry : shader->GetEntries().images) {
@@ -1547,12 +1609,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::SyncTransformFeedback() {
+    // Translates guest transform feedback layouts into glTransformFeedbackStreamAttribsNV input
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+    // when this is required.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    // Attributes are written as triples of GLint. The sizing assumes at most 128 triples per
+    // buffer (TODO confirm against varying_count's range) plus one GL_NEXT_BUFFER_NV separator
+    // per buffer; without the latter, fully populated buffers would overflow the array.
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, (128 + 1) * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+        if (layout.varying_count == 0) {
+            continue;
+        }
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // Every used buffer after the first is preceded by a separator token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+            if (current_index == index) {
+                // Same location / 4 group as the previous triple: add one component to it
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
     const auto& regs = system.GPU().Maxwell3D().regs;
     if (regs.tfb_enabled == 0) {
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1579,6 +1699,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                           static_cast<GLsizeiptr>(size));
     }
 
+    // We may need glBeginTransformFeedbackNV here: on Nvidia's driver it seems to be a different
+    // implementation (the function pointer differs). We use ARB_transform_feedback3 features
+    // through NV_transform_feedback interactions, and the ARB extension does not define any
+    // interaction with the non-NV BeginTransformFeedback. In practice this call just works.
     glBeginTransformFeedback(GL_POINTS);
 }
 
@@ -1600,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index f5dc56a0e..4f082592f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -100,10 +99,10 @@ private:
     void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
 
     /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(const Shader& kernel);
+    void SetupComputeConstBuffers(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
@@ -111,30 +110,30 @@ private:
                           std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
 
     /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(const Shader& kernel);
+    void SetupComputeGlobalMemory(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
     /// Configures the current textures to use for the draw command.
-    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+    void SetupDrawTextures(std::size_t stage_index, Shader* shader);
 
     /// Configures the textures used in a compute shader.
-    void SetupComputeTextures(const Shader& kernel);
+    void SetupComputeTextures(Shader* kernel);
 
     /// Configures a texture.
     void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
                       const SamplerEntry& entry);
 
     /// Configures images in a graphics shader.
-    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+    void SetupDrawImages(std::size_t stage_index, Shader* shader);
 
     /// Configures images in a compute shader.
-    void SetupComputeImages(const Shader& shader);
+    void SetupComputeImages(Shader* shader);
 
     /// Configures an image.
     void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -202,6 +201,10 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
     /// Begin a transform feedback
     void BeginTransformFeedback(GLenum primitive_mode);
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index a991ca64a..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -147,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u
     auto program = std::make_shared<ProgramHandle>();
 
     if (device.UseAssemblyShaders()) {
-        const std::string arb = "Not implemented";
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
 
         GLuint& arb_prog = program->assembly_program.handle;
 
@@ -194,12 +197,9 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, ProgramSharedPtr program_)
-    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program_)} {
-    // Assign either the assembly program or source program. We can't have both.
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+               ProgramSharedPtr program_)
+    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
     handle = program->assembly_program.handle;
     if (handle == 0) {
         handle = program->source_program.handle;
@@ -207,16 +207,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
     ASSERT(handle != 0);
 }
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-GLuint CachedShader::GetHandle() const {
+GLuint Shader::GetHandle() const {
     DEBUG_ASSERT(registry->IsConsistent());
     return handle;
 }
 
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
-                                           Maxwell::ShaderProgram program_type, ProgramCode code,
-                                           ProgramCode code_b) {
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
+                                                      Maxwell::ShaderProgram program_type,
+                                                      ProgramCode code, ProgramCode code_b) {
     const auto shader_type = GetShaderType(program_type);
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
@@ -241,12 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
-                         MakeEntries(params.device, ir, shader_type), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(
+        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+                                                       ProgramCode code) {
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
     auto& engine = params.system.GPU().KeplerCompute();
@@ -266,23 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
-                         MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(std::move(registry),
+                                              MakeEntries(params.device, ir, ShaderType::Compute),
+                                              std::move(program)));
 }
 
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const PrecompiledShader& precompiled_shader,
-                                     std::size_t size_in_bytes) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
-                         precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+                                                const PrecompiledShader& precompiled_shader) {
+    return std::unique_ptr<Shader>(new Shader(
+        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
 }
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
-      disk_cache{system} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+      emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -436,7 +436,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
     return program;
 }
 
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
         return last_shaders[static_cast<std::size_t>(program)];
     }
@@ -446,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     // Look up shader in the cache based on address
     const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
-    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
-    if (shader) {
+    if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
         return last_shaders[static_cast<std::size_t>(program)] = shader;
     }
 
@@ -461,62 +460,64 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> shader;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
-                                                     std::move(code_b));
+        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        shader = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(shader);
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
-        null_shader = shader;
+        null_shader = std::move(shader);
     }
 
-    return last_shaders[static_cast<std::size_t>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = result;
 }
 
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
     auto& memory_manager{system.GPU().MemoryManager()};
     const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
 
-    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
-    if (kernel) {
+    if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
         return kernel;
     }
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> kernel;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+        kernel = Shader::CreateKernelFromMemory(params, std::move(code));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        kernel = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(kernel);
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
-        null_kernel = kernel;
+        null_kernel = std::move(kernel);
     }
-    return kernel;
+    return result;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index b2ae8d7f9..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -35,12 +35,9 @@ class EmuWindow;
 
 namespace OpenGL {
 
-class CachedShader;
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct ProgramHandle {
@@ -64,62 +61,53 @@ struct ShaderParameters {
     u64 unique_identifier;
 };
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
 public:
-    ~CachedShader();
+    ~Shader();
 
     /// Gets the GL program handle for the shader
     GLuint GetHandle() const;
 
-    /// Returns the size in bytes of the shader
-    std::size_t GetSizeInBytes() const override {
-        return size_in_bytes;
-    }
-
     /// Gets the shader entries for the shader
     const ShaderEntries& GetEntries() const {
         return entries;
     }
 
-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+                                                         Maxwell::ShaderProgram program_type,
+                                                         ProgramCode program_code,
+                                                         ProgramCode program_code_b);
+    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+                                                          ProgramCode code);
 
-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const PrecompiledShader& precompiled_shader,
-                                  std::size_t size_in_bytes);
+    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+                                                   const PrecompiledShader& precompiled_shader);
 
 private:
-    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, ProgramSharedPtr program);
+    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+                    ProgramSharedPtr program);
 
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
-    std::size_t size_in_bytes = 0;
     ProgramSharedPtr program;
     GLuint handle = 0;
 };
 
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                Core::Frontend::EmuWindow& emu_window, const Device& device);
+    ~ShaderCacheOpenGL() override;
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
                        const VideoCore::DiskResourceLoadCallback& callback);
 
     /// Gets the current specified shader stage program
-    Shader GetStageProgram(Maxwell::ShaderProgram program);
+    Shader* GetStageProgram(Maxwell::ShaderProgram program);
 
     /// Gets a compute kernel in the passed address
-    Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const Shader& object) override {}
+    Shader* GetComputeKernel(GPUVAddr code_addr);
 
 private:
     ProgramSharedPtr GeneratePrecompiledProgram(
@@ -132,10 +120,10 @@ private:
     ShaderDiskCacheOpenGL disk_cache;
     std::unordered_map<u64, PrecompiledShader> runtime_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d6e30b321..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
@@ -526,6 +527,9 @@ private:
         if (device.HasImageLoadFormatted()) {
             code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
         }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
         if (device.HasWarpIntrinsics()) {
             code.AddLine("#extension GL_NV_gpu_shader5 : require");
             code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -909,13 +913,13 @@ private:
                     return "samplerBuffer";
                 }
                 switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                     return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                     return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                     return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                     return "samplerCube";
                 default:
                     UNREACHABLE();
@@ -1380,8 +1384,19 @@ private:
         const std::size_t count = operation.GetOperandsCount();
         const bool has_array = meta->sampler.is_array;
         const bool has_shadow = meta->sampler.is_shadow;
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }
 
-        std::string expr = "texture" + function_suffix;
         if (!meta->aoffi.empty()) {
             expr += "Offset";
         } else if (!meta->ptp.empty()) {
@@ -1415,6 +1430,16 @@ private:
             expr += ')';
         }
 
+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
         for (const auto& variant : extras) {
             if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                 expr += GenerateTextureArgument(*argument);
@@ -2041,8 +2066,19 @@ private:
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
         if (meta->sampler.is_shadow) {
             expr = "vec4(" + expr + ')';
         }
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..653c3f2f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
 
 namespace {
 
+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;
 
 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
     u32 value = 0;
 };
 
-struct BoundSamplerKey {
+struct BoundSamplerEntry {
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry {
+    u32 cbuf1 = 0;
+    u32 cbuf2 = 0;
+    u32 offset1 = 0;
+    u32 offset2 = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
     u32 cbuf = 0;
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     u32 texture_handler_size_value;
     u32 num_keys;
     u32 num_bound_samplers;
+    u32 num_separate_samplers;
     u32 num_bindless_samplers;
     if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
         file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
         file.ReadArray(&texture_handler_size_value, 1) != 1 ||
         file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
         file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
         file.ReadArray(&num_bindless_samplers, 1) != 1) {
         return false;
     }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     }
 
     std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
     if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
         file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
             flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
         file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
             flat_bindless_samplers.size()) {
         return false;
     }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
     }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
     }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
     }
 
     return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
         file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
         return false;
     }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
     }
 
-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
     flat_bound_samplers.reserve(bound_samplers.size());
     for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+    }
+
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
     }
 
-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
     flat_bindless_samplers.reserve(bindless_samplers.size());
     for (const auto& [address, sampler] : bindless_samplers) {
         flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
     }
 
     return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
            file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
                flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
            file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
                flat_bindless_samplers.size();
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
     VideoCommon::Shader::ComputeInfo compute_info;
     VideoCommon::Shader::KeyMap keys;
     VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
     VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle);
     gl_buffer.Release();
 }
 
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
     ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
     }
 
-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
     /*
      * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
      * and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:
 
     void Unmap(GLsizeiptr size);
 
+    GLuint Handle() const {
+        return gl_buffer.handle;
+    }
+
+    u64 Address() const {
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept {
+        return buffer_size;
+    }
+
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 57db5a08b..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
     target = GetTextureTarget(params.target);
     texture = CreateTexture(params, target, internal_format, texture_buffer);
     DecorateSurfaceName();
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
-        true);
+
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+
+    main_view =
+        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -413,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
     ASSERT(params.num_levels == 1);
 
+    if (params.target == SurfaceTarget::Texture3D) {
+        if (params.num_layers > 1) {
+            ASSERT(params.base_layer == 0);
+            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+        } else {
+            glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+                                   params.base_level, params.base_layer);
+        }
+        return;
+    }
+
     if (params.num_layers > 1) {
-        // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
-
-        switch (params.target) {
-        case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), 0);
-            break;
-        default:
-            UNIMPLEMENTED();
-        }
+        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
         return;
     }
 
@@ -434,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture1DArray:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
                                   params.base_layer);
         break;
     default:
@@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {
     OGLTextureView texture_view;
     texture_view.Create();
 
-    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
-                  params.num_levels, params.base_layer, params.num_layers);
+    if (target == GL_TEXTURE_3D) {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, 0, 1);
+    } else {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, params.base_layer, params.num_layers);
+    }
     ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
 
     return texture_view;
@@ -544,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
                                    const Tegra::Engines::Fermi2D::Config& copy_config) {
     const auto& src_params{src_view->GetSurfaceParams()};
     const auto& dst_params{dst_view->GetSurfaceParams()};
-    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
-    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+    UNIMPLEMENTED_IF(src_params.depth != 1);
+    UNIMPLEMENTED_IF(dst_params.depth != 1);
 
     state_tracker.NotifyScissor0();
     state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 8a2ac8603..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,8 +80,10 @@ public:
     explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
     ~CachedSurfaceView();
 
-    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
-    void Attach(GLenum attachment, GLenum target) const;
+    /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+    /// @param attachment   Framebuffer attachment point to attach this view to
+    /// @param fb_target    Framebuffer target to attach to (e.g. GL_DRAW_FRAMEBUFFER)
+    void Attach(GLenum attachment, GLenum fb_target) const;
 
     GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
                       Tegra::Texture::SwizzleSource y_source,
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..774e70a5b 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -46,12 +47,11 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedInt:
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -70,10 +70,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,46 +84,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_UNSIGNED_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_UNSIGNED_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-        return {};
+        break;
     }
+    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
+                      attrib.SizeString());
+    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +101,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
     case Maxwell::IndexFormat::UnsignedInt:
         return GL_UNSIGNED_INT;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
-    UNREACHABLE();
+    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
     return {};
 }
 
@@ -180,33 +143,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
-                                Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+                                Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
     switch (filter_mode) {
-    case Tegra::Texture::TextureFilter::Linear: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Nearest:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_LINEAR;
+            return GL_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_LINEAR;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
         break;
-    }
-    case Tegra::Texture::TextureFilter::Nearest: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Linear:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_NEAREST;
+            return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_NEAREST;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_LINEAR;
         }
         break;
     }
-    }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
-    return GL_LINEAR;
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    return GL_NEAREST;
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +191,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-        return GL_REPEAT;
     }
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +215,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
     case Tegra::Texture::DepthCompareFunc::Always:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
-              static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
     return GL_GREATER;
 }
 
@@ -277,7 +237,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
     return GL_FUNC_ADD;
 }
 
@@ -341,7 +301,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
     return GL_ZERO;
 }
 
@@ -361,7 +321,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
     case Tegra::Texture::SwizzleSource::OneFloat:
         return GL_ONE;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
     return GL_ZERO;
 }
 
@@ -392,7 +352,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
     case Maxwell::ComparisonOp::AlwaysOld:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
     return GL_ALWAYS;
 }
 
@@ -423,7 +383,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
     case Maxwell::StencilOp::DecrWrapOGL:
         return GL_DECR_WRAP;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
     return GL_KEEP;
 }
 
@@ -434,7 +394,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
     case Maxwell::FrontFace::CounterClockWise:
         return GL_CCW;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
     return GL_CCW;
 }
 
@@ -447,7 +407,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
     case Maxwell::CullFace::FrontAndBack:
         return GL_FRONT_AND_BACK;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
     return GL_BACK;
 }
 
@@ -486,7 +446,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
     case Maxwell::LogicOperation::Set:
         return GL_SET;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
     return GL_COPY;
 }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6214fcbc3..c40adb6e7 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    // GPU address of vertex_buffer, queried when the device has unified vertex buffer memory
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 62e950d31..d7f1ae89f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -21,29 +21,29 @@ namespace Sampler {
 
 VkFilter Filter(Tegra::Texture::TextureFilter filter) {
     switch (filter) {
-    case Tegra::Texture::TextureFilter::Linear:
-        return VK_FILTER_LINEAR;
     case Tegra::Texture::TextureFilter::Nearest:
         return VK_FILTER_NEAREST;
+    case Tegra::Texture::TextureFilter::Linear:
+        return VK_FILTER_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter));
+    UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter));
     return {};
 }
 
 VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) {
     switch (mipmap_filter) {
     case Tegra::Texture::TextureMipmapFilter::None:
-        // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping
-        // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to
-        // use an image view with a single mipmap level to emulate this.
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
-        ;
-    case Tegra::Texture::TextureMipmapFilter::Linear:
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
+        // There are no Vulkan filter modes that directly correspond to OpenGL minification filters
+        // of GL_LINEAR or GL_NEAREST, but they can be emulated using
+        // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter =
+        // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively.
+        return VK_SAMPLER_MIPMAP_MODE_NEAREST;
     case Tegra::Texture::TextureMipmapFilter::Nearest:
         return VK_SAMPLER_MIPMAP_MODE_NEAREST;
+    case Tegra::Texture::TextureMipmapFilter::Linear:
+        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
+    UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
     return {};
 }
 
@@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
         UNIMPLEMENTED();
         return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
+    return {};
 }
 
 VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) {
@@ -288,14 +287,35 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
         return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
     case Maxwell::PrimitiveTopology::Patches:
         return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
+    return {};
 }
 
 VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
     switch (type) {
+    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return VK_FORMAT_R8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+            return VK_FORMAT_R8G8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return VK_FORMAT_R8G8B8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return VK_FORMAT_R8G8B8A8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return VK_FORMAT_R16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return VK_FORMAT_R16G16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return VK_FORMAT_R16G16B16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return VK_FORMAT_R16G16B16A16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
+        }
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -316,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R16G16B16A16_SNORM;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return VK_FORMAT_A2B10G10R10_SNORM_PACK32;
-        default:
-            break;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_UNORM;
+            return VK_FORMAT_R8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_UNORM;
+            return VK_FORMAT_R8G8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_UNORM;
+            return VK_FORMAT_R8G8B8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_UNORM;
+            return VK_FORMAT_R8G8B8A8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_UNORM;
+            return VK_FORMAT_R16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_UNORM;
+            return VK_FORMAT_R16G16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_UNORM;
+            return VK_FORMAT_R16G16B16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_UNORM;
+            return VK_FORMAT_R16G16B16A16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
-            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
-        default:
-            break;
+            return VK_FORMAT_A2B10G10R10_USCALED_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SINT;
+            return VK_FORMAT_R8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SINT;
+            return VK_FORMAT_R8G8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SINT;
+            return VK_FORMAT_R8G8B8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SINT;
+            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SINT;
+            return VK_FORMAT_R16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SINT;
+            return VK_FORMAT_R16G16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SINT;
+            return VK_FORMAT_R16G16B16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32:
-            return VK_FORMAT_R32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32:
-            return VK_FORMAT_R32G32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32:
-            return VK_FORMAT_R32G32B32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
-            return VK_FORMAT_R32G32B32A32_SINT;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SSCALED;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;
         }
         break;
     case Maxwell::VertexAttribute::Type::UnsignedInt:
@@ -400,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_UINT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_UINT;
-        default:
-            break;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_USCALED;
+            return VK_FORMAT_R8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_USCALED;
+            return VK_FORMAT_R8G8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_USCALED;
+            return VK_FORMAT_R8G8B8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_USCALED;
+            return VK_FORMAT_R8G8B8A8_SINT;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_USCALED;
+            return VK_FORMAT_R16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_USCALED;
+            return VK_FORMAT_R16G16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_USCALED;
+            return VK_FORMAT_R16G16B16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_USCALED;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+            return VK_FORMAT_R32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+            return VK_FORMAT_R32G32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+            return VK_FORMAT_R32G32B32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return VK_FORMAT_R32G32B32A32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::Float:
         switch (size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SSCALED;
+            return VK_FORMAT_R16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SSCALED;
+            return VK_FORMAT_R16G16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SSCALED;
+            return VK_FORMAT_R16G16B16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SSCALED;
-        default:
-            break;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::Float:
-        switch (size) {
+            return VK_FORMAT_R16G16B16A16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32:
             return VK_FORMAT_R32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -458,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SFLOAT;
-        default:
-            break;
         }
         break;
     }
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 59b441943..2d9b18ed9 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -13,6 +13,7 @@
 #include <fmt/format.h>
 
 #include "common/dynamic_library.h"
+#include "common/file_util.h"
 #include "common/logging/log.h"
 #include "common/telemetry.h"
 #include "core/core.h"
@@ -76,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() {
     char* libvulkan_env = getenv("LIBVULKAN_PATH");
     if (!libvulkan_env || !library.Open(libvulkan_env)) {
         // Use the libvulkan.dylib from the application bundle.
-        std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
+        const std::string filename =
+            FileUtil::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
         library.Open(filename.c_str());
     }
 #else
@@ -153,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc
         }
     }
 
-    static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"};
-    vk::Span<const char*> layers = layers_data;
-    if (!enable_layers) {
-        layers = {};
+    std::vector<const char*> layers;
+    layers.reserve(1);
+    if (enable_layers) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+
+    const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld);
+    if (!layer_properties) {
+        LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers");
+        layers.clear();
+    }
+
+    for (auto layer_it = layers.begin(); layer_it != layers.end();) {
+        const char* const layer = *layer_it;
+        const auto it = std::find_if(
+            layer_properties->begin(), layer_properties->end(),
+            [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); });
+        if (it == layer_properties->end()) {
+            LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer);
+            layer_it = layers.erase(layer_it);
+        } else {
+            ++layer_it;
+        }
     }
+
     vk::Instance instance = vk::Instance::Create(layers, extensions, dld);
     if (!instance) {
         LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance");
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5f33d9e40..f10f96cd8 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                                     VAddr cpu_addr, std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
     ci.pNext = nullptr;
@@ -54,46 +54,17 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
     buffer.commit = memory_manager.Commit(buffer.handle, false);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
 
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                             const VKDevice& device, VKMemoryManager& memory_manager,
-                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
-                                                                 CreateStreamBuffer(device,
-                                                                                    scheduler)},
-      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
-                                                                                staging_pool} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
-    size = std::max(size, std::size_t(4));
-    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
-        cmdbuf.FillBuffer(buffer, 0, size, 0);
-    });
-    return *empty.handle;
-}
-
-void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                    const u8* data) {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
 
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -102,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
         barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
@@ -110,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     });
 }
 
-void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                      u8* data) {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
         barrier.pNext = nullptr;
@@ -123,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
         barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
 
@@ -131,18 +102,20 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-        cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size});
+        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
     });
     scheduler.Finish();
 
     std::memcpy(data, staging.commit->Map(size), size);
 }
 
-void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                              std::size_t dst_offset, std::size_t size) {
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
-                      dst_offset, size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer dst_buffer = Handle();
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
+                      size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
         std::array<VkBufferMemoryBarrier, 2> barriers;
@@ -169,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
     });
 }
 
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                             const VKDevice& device, VKMemoryManager& memory_manager,
+                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
+    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
+                                                                 CreateStreamBuffer(device,
+                                                                                    scheduler)},
+      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
+                                                                                staging_pool} {}
+
+VKBufferCache::~VKBufferCache() = default;
+
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr,
+                                    size);
+}
+
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
+    size = std::max(size, std::size_t(4));
+    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
+        cmdbuf.FillBuffer(buffer, 0, size, 0);
+    });
+    return {*empty.handle, 0, 0};
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index a54583e7d..3630aca77 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -8,7 +8,6 @@
 
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
@@ -24,22 +23,34 @@ class VKDevice;
 class VKMemoryManager;
 class VKScheduler;
 
-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                               VAddr cpu_addr, std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
+    ~Buffer();
 
-    VkBuffer GetHandle() const {
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
+    VkBuffer Handle() const {
         return *buffer.handle;
     }
 
+    u64 Address() const {
+        return 0;
+    }
+
 private:
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+
     VKBuffer buffer;
 };
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
 class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
 public:
     explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -47,21 +58,10 @@ public:
                            VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
     ~VKBufferCache();
 
-    VkBuffer GetEmptyBuffer(std::size_t size) override;
+    BufferInfo GetEmptyBuffer(std::size_t size) override;
 
 protected:
-    VkBuffer ToHandle(const Buffer& buffer) override;
-
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
 private:
     const VKDevice& device;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index b8ccf164f..ea66e621e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -27,6 +27,7 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/shader/compiler_settings.h"
 #include "video_core/shader/memory_util.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -132,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con
     return std::memcmp(&rhs, this, sizeof *this) == 0;
 }
 
-CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage,
-                           GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code,
-                           u32 main_offset)
-    : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)},
+Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+               VideoCommon::Shader::ProgramCode program_code, u32 main_offset)
+    : gpu_addr{gpu_addr}, program_code{std::move(program_code)},
       registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset,
                                                            compiler_settings, registry},
       entries{GenerateShaderEntries(shader_ir)} {}
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine(
-    Core::System& system, Tegra::Engines::ShaderType stage) {
-    if (stage == Tegra::Engines::ShaderType::Compute) {
+Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system,
+                                                              Tegra::Engines::ShaderType stage) {
+    if (stage == ShaderType::Compute) {
         return system.GPU().KeplerCompute();
     } else {
         return system.GPU().Maxwell3D();
@@ -156,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri
                                  VKDescriptorPool& descriptor_pool,
                                  VKUpdateDescriptorQueue& update_descriptor_queue,
                                  VKRenderPassCache& renderpass_cache)
-    : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler},
-      descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue},
-      renderpass_cache{renderpass_cache} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device},
+      scheduler{scheduler}, descriptor_pool{descriptor_pool},
+      update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {}
 
 VKPipelineCache::~VKPipelineCache() = default;
 
-std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
+std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
     const auto& gpu = system.GPU().Maxwell3D();
 
-    std::array<Shader, Maxwell::MaxShaderProgram> shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> shaders{};
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
         const auto program{static_cast<Maxwell::ShaderProgram>(index)};
 
@@ -178,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
         const GPUVAddr program_addr{GetShaderAddress(system, program)};
         const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
         ASSERT(cpu_addr);
-        auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
-        if (!shader) {
+
+        Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
+        if (!result) {
             const auto host_ptr{memory_manager.GetPointer(program_addr)};
 
             // No shader found - create a new one
             constexpr u32 stage_offset = STAGE_MAIN_OFFSET;
-            const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1);
+            const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1);
             ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false);
+            const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+            auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code),
+                                                   stage_offset);
+            result = shader.get();
 
-            shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr,
-                                                    std::move(code), stage_offset);
             if (cpu_addr) {
-                Register(shader);
+                Register(std::move(shader), *cpu_addr, size_in_bytes);
             } else {
-                null_shader = shader;
+                null_shader = std::move(shader);
             }
         }
-        shaders[index] = std::move(shader);
+        shaders[index] = result;
     }
     return last_shaders = shaders;
 }
@@ -236,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
     ASSERT(cpu_addr);
 
-    auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
+    Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get();
     if (!shader) {
         // No shader found - create a new one
         const auto host_ptr = memory_manager.GetPointer(program_addr);
 
         ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true);
-        shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute,
-                                                program_addr, *cpu_addr, std::move(code),
-                                                KERNEL_MAIN_OFFSET);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+        auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr,
+                                                    std::move(code), KERNEL_MAIN_OFFSET);
+        shader = shader_info.get();
+
         if (cpu_addr) {
-            Register(shader);
+            Register(std::move(shader_info), *cpu_addr, size_in_bytes);
         } else {
-            null_kernel = shader;
+            null_kernel = std::move(shader_info);
         }
     }
 
@@ -264,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     return *entry;
 }
 
-void VKPipelineCache::Unregister(const Shader& shader) {
+void VKPipelineCache::OnShaderRemoval(Shader* shader) {
     bool finished = false;
     const auto Finish = [&] {
         // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and
@@ -296,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) {
         Finish();
         it = compute_cache.erase(it);
     }
-
-    RasterizerCache::Unregister(shader);
 }
 
 std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>>
@@ -332,12 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         }
 
         const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
-        ASSERT(shader);
+        const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+        Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
 
         const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
-        const auto program_type = GetShaderType(program_enum);
+        const ShaderType program_type = GetShaderType(program_enum);
         const auto& entries = shader->GetEntries();
         program[stage] = {
             Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 0b5796fef..0a36e5112 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -17,7 +17,6 @@
 #include "common/common_types.h"
 #include "video_core/engines/const_buffer_engine_interface.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/fixed_pipeline_state.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
@@ -26,6 +25,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -41,8 +41,6 @@ class VKFence;
 class VKScheduler;
 class VKUpdateDescriptorQueue;
 
-class CachedShader;
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct GraphicsPipelineCacheKey {
@@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> {
 
 namespace Vulkan {
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader {
 public:
-    explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
-                          VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code,
-                          u32 main_offset);
-    ~CachedShader();
+    explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+                    VideoCommon::Shader::ProgramCode program_code, u32 main_offset);
+    ~Shader();
 
     GPUVAddr GetGpuAddr() const {
         return gpu_addr;
     }
 
-    std::size_t GetSizeInBytes() const override {
-        return program_code.size() * sizeof(u64);
-    }
-
     VideoCommon::Shader::ShaderIR& GetIR() {
         return shader_ir;
     }
@@ -144,25 +137,23 @@ private:
     ShaderEntries entries;
 };
 
-class VKPipelineCache final : public RasterizerCache<Shader> {
+class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
                              const VKDevice& device, VKScheduler& scheduler,
                              VKDescriptorPool& descriptor_pool,
                              VKUpdateDescriptorQueue& update_descriptor_queue,
                              VKRenderPassCache& renderpass_cache);
-    ~VKPipelineCache();
+    ~VKPipelineCache() override;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> GetShaders();
+    std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders();
 
     VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key);
 
     VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key);
 
 protected:
-    void Unregister(const Shader& shader) override;
-
-    void FlushObjectInner(const Shader& object) override {}
+    void OnShaderRemoval(Shader* shader) final;
 
 private:
     std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders(
@@ -175,10 +166,10 @@ private:
     VKUpdateDescriptorQueue& update_descriptor_queue;
     VKRenderPassCache& renderpass_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 
     GraphicsPipelineCacheKey last_graphics_key;
     VKGraphicsPipeline* last_graphics_pipeline = nullptr;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index d86c46412..a8d94eac3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -38,6 +38,7 @@
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
 }
 
 std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;
     for (std::size_t i = 0; i < std::size(addresses); ++i) {
         addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0;
@@ -117,6 +118,17 @@ template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                std::size_t stage, std::size_t index = 0) {
     const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
         const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
         return engine.GetTextureInfo(tex_handle);
@@ -131,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
     }
 }
 
+/// @brief Determine if an attachment to be updated has to preserve contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
+    if (!is_clear) {
+        return true;
+    }
+    // First we have to make sure all clear masks are enabled.
+    if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
+        !regs.clear_buffers.A) {
+        return true;
+    }
+    // If scissors are disabled, the whole screen is cleared
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Then we have to confirm scissor testing clears the whole image
+    const std::size_t index = regs.clear_buffers.RT;
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
+           scissor.max_y < regs.rt[index].height;
+}
+
+/// @brief Determine if an attachment to be updated has to preserve contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
+    // If we are not clearing, the contents have to be preserved
+    if (!is_clear) {
+        return true;
+    }
+    // For depth stencil clears we only have to confirm scissor test covers the whole image
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Make sure the clear covers the whole image
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
+           scissor.max_y < regs.zeta_height;
+}
+
 } // Anonymous namespace
 
 class BufferBindings final {
@@ -332,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     buffer_cache.Unmap();
 
-    const Texceptions texceptions = UpdateAttachments();
+    const Texceptions texceptions = UpdateAttachments(false);
     SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
 
     key.renderpass_params = GetRenderPassParams(texceptions);
@@ -388,7 +443,7 @@ void RasterizerVulkan::Clear() {
         return;
     }
 
-    [[maybe_unused]] const auto texceptions = UpdateAttachments();
+    [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
     DEBUG_ASSERT(texceptions.none());
     SetupImageTransitions(0, color_attachments, zeta_attachment);
 
@@ -665,9 +720,12 @@ void RasterizerVulkan::FlushWork() {
     draw_counter = 0;
 }
 
-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
+RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
     MICROPROFILE_SCOPE(Vulkan_RenderTargets);
-    auto& dirty = system.GPU().Maxwell3D().dirty.flags;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    auto& dirty = maxwell3d.dirty.flags;
+    auto& regs = maxwell3d.regs;
+
     const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
     dirty[VideoCommon::Dirty::RenderTargets] = false;
 
@@ -676,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     Texceptions texceptions;
     for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
         if (update_rendertargets) {
-            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+            const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
+            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
         }
         if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
             texceptions[rt] = true;
@@ -684,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     }
 
     if (update_rendertargets) {
-        zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+        const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
+        zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
     }
     if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
         texceptions[ZETA_TEXCEPTION_INDEX] = true;
@@ -716,7 +776,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
         if (!view) {
             return false;
         }
-        key.views.push_back(view->GetHandle());
+        key.views.push_back(view->GetAttachment());
         key.width = std::min(key.width, view->GetWidth());
         key.height = std::min(key.height, view->GetHeight());
         key.layers = std::min(key.layers, view->GetNumLayers());
@@ -776,12 +836,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
 }
 
 void RasterizerVulkan::SetupShaderDescriptors(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     texture_cache.GuardSamplers(true);
 
     for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
         // Skip VertexA stage
-        const auto& shader = shaders[stage + 1];
+        Shader* const shader = shaders[stage + 1];
         if (!shader) {
             continue;
         }
@@ -858,10 +918,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     UNIMPLEMENTED_IF(binding.buffer_offset != 0);
 
     const GPUVAddr gpu_addr = binding.Address();
-    const std::size_t size = binding.buffer_size;
-    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+    const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
 
-    scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
         cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
         cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
     });
@@ -913,8 +973,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
             buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
             continue;
         }
-        const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
-        buffer_bindings.AddVertexBinding(buffer, offset);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        buffer_bindings.AddVertexBinding(info.handle, info.offset);
     }
 }
 
@@ -936,7 +996,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
         std::tie(buffer, offset) = quad_indexed_pass.Assemble(
             regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
 
@@ -950,7 +1012,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
 
         auto format = regs.index_array.format;
         const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
@@ -1097,10 +1161,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
         Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
     ASSERT(size <= MaxConstbufferSize);
 
-    const auto [buffer_handle, offset] =
+    const auto info =
         buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
-
-    update_descriptor_queue.AddBuffer(buffer_handle, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
@@ -1114,14 +1177,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
         // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
         // default buffer.
         static constexpr std::size_t dummy_size = 4;
-        const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
-        update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
+        const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
+        update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
         return;
     }
 
-    const auto [buffer, offset] = buffer_cache.UploadMemory(
+    const auto info = buffer_cache.UploadMemory(
         actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
-    update_descriptor_queue.AddBuffer(buffer, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
@@ -1137,12 +1200,12 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     auto view = texture_cache.GetTextureSurface(texture.tic, entry);
     ASSERT(!view->IsBufferView());
 
-    const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source,
-                                            texture.tic.z_source, texture.tic.w_source);
+    const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
+                                                      texture.tic.z_source, texture.tic.w_source);
     const auto sampler = sampler_cache.GetSampler(texture.tsc);
     update_descriptor_queue.AddSampledImage(sampler, image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
     sampled_views.push_back(ImageView{std::move(view), image_layout});
 }
@@ -1164,10 +1227,11 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
 
     UNIMPLEMENTED_IF(tic.IsBuffer());
 
-    const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    const VkImageView image_view =
+        view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
     update_descriptor_queue.AddImage(image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_GENERAL;
     image_views.push_back(ImageView{std::move(view), image_layout});
 }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 04be37a5e..83e00e7e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -159,7 +159,10 @@ private:
 
     void FlushWork();
 
-    Texceptions UpdateAttachments();
+    /// @brief Updates the currently bound attachments
+    /// @param is_clear True when the framebuffer is updated as a clear
+    /// @return Bitfield of attachments being used as sampled textures
+    Texceptions UpdateAttachments(bool is_clear);
 
     std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
 
@@ -168,7 +171,7 @@ private:
                                  bool is_indexed, bool is_instanced);
 
     /// Setup descriptors in the graphics pipeline.
-    void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders);
+    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
 
     void SetupImageTransitions(Texceptions texceptions,
                                const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index e6f2fa553..616eacc36 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -9,6 +9,8 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/textures/texture.h"
 
+using Tegra::Texture::TextureMipmapFilter;
+
 namespace Vulkan {
 
 namespace {
@@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c
     ci.maxAnisotropy = tsc.GetMaxAnisotropy();
     ci.compareEnable = tsc.depth_compare_enabled;
     ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func);
-    ci.minLod = tsc.GetMinLod();
-    ci.maxLod = tsc.GetMaxLod();
+    ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod();
+    ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod();
     ci.borderColor = arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color);
     ci.unnormalizedCoordinates = VK_FALSE;
     return device.GetLogical().CreateSampler(ci);
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 82ec9180e..56524e6f3 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {
 }
 
 void VKScheduler::WorkerThread() {
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     std::unique_lock lock{mutex};
     do {
         cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index dfddf7ad6..689f0d276 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,10 +35,14 @@ public:
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
     void Unmap(u64 size);
 
-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const noexcept {
         return *buffer;
     }
 
+    u64 Address() const noexcept {
+        return 0;
+    }
+
 private:
     struct Watch final {
         VKFenceWatch fence;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index ea487b770..430031665 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
         ci.extent = {params.width, params.height, 1};
         break;
     case SurfaceTarget::Texture3D:
+        ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
         ci.extent = {params.width, params.height, params.depth};
         break;
     case SurfaceTarget::TextureBuffer:
@@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
     return ci;
 }
 
+u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source,
+                  Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace
 
 CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
@@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
     }
 
     // TODO(Rodrigo): Move this to a virtual function.
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels),
-        true);
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+    main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {
 }
 
 View CachedSurface::CreateView(const ViewParams& params) {
-    return CreateViewInner(params, false);
-}
-
-View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {
     // TODO(Rodrigo): Add name decorations
-    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy);
+    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);
 }
 
 void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) {
@@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {
 }
 
 CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
-                                     const ViewParams& params, bool is_proxy)
+                                     const ViewParams& params)
     : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},
       image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},
       aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface},
-      base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level},
-      num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target)
-                                                           : VK_IMAGE_VIEW_TYPE_1D} {}
+      base_level{params.base_level}, num_levels{params.num_levels},
+      image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} {
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        base_layer = 0;
+        num_layers = 1;
+        base_slice = params.base_layer;
+        num_slices = params.num_layers;
+    } else {
+        base_layer = params.base_layer;
+        num_layers = params.num_layers;
+    }
+}
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
-                                         SwizzleSource z_source, SwizzleSource w_source) {
+VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source,
+                                            SwizzleSource z_source, SwizzleSource w_source) {
     const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
     if (last_image_view && last_swizzle == new_swizzle) {
         return last_image_view;
@@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
             });
     }
 
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        ASSERT(base_slice == 0);
+        ASSERT(num_slices == params.depth);
+    }
+
     VkImageViewCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
     ci.pNext = nullptr;
@@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
     return last_image_view = *image_view;
 }
 
+VkImageView CachedSurfaceView::GetAttachment() {
+    if (render_target) {
+        return *render_target;
+    }
+
+    VkImageViewCreateInfo ci;
+    ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    ci.pNext = nullptr;
+    ci.flags = 0;
+    ci.image = surface.GetImageHandle();
+    ci.format = surface.GetImage().GetFormat();
+    ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
+                     VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY};
+    ci.subresourceRange.aspectMask = aspect_mask;
+    ci.subresourceRange.baseMipLevel = base_level;
+    ci.subresourceRange.levelCount = num_levels;
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D;
+        ci.subresourceRange.baseArrayLayer = base_slice;
+        ci.subresourceRange.layerCount = num_slices;
+    } else {
+        ci.viewType = image_view_type;
+        ci.subresourceRange.baseArrayLayer = base_layer;
+        ci.subresourceRange.layerCount = num_layers;
+    }
+    render_target = device.GetLogical().CreateImageView(ci);
+    return *render_target;
+}
+
 VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                                const VKDevice& device, VKResourceManager& resource_manager,
                                VKMemoryManager& memory_manager, VKScheduler& scheduler,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index f211ccb1e..807e26c8a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -91,7 +91,6 @@ protected:
     void DecorateSurfaceName();
 
     View CreateView(const ViewParams& params) override;
-    View CreateViewInner(const ViewParams& params, bool is_proxy);
 
 private:
     void UploadBuffer(const std::vector<u8>& staging_buffer);
@@ -120,23 +119,20 @@ private:
 class CachedSurfaceView final : public VideoCommon::ViewBase {
 public:
     explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
-                               const ViewParams& params, bool is_proxy);
+                               const ViewParams& params);
     ~CachedSurfaceView();
 
-    VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
-                          Tegra::Texture::SwizzleSource y_source,
-                          Tegra::Texture::SwizzleSource z_source,
-                          Tegra::Texture::SwizzleSource w_source);
+    VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source,
+                             Tegra::Texture::SwizzleSource y_source,
+                             Tegra::Texture::SwizzleSource z_source,
+                             Tegra::Texture::SwizzleSource w_source);
+
+    VkImageView GetAttachment();
 
     bool IsSameSurface(const CachedSurfaceView& rhs) const {
         return &surface == &rhs.surface;
     }
 
-    VkImageView GetHandle() {
-        return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
-                         Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
-    }
-
     u32 GetWidth() const {
         return params.GetMipWidth(base_level);
     }
@@ -180,14 +176,6 @@ public:
     }
 
 private:
-    static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
-                             Tegra::Texture::SwizzleSource y_source,
-                             Tegra::Texture::SwizzleSource z_source,
-                             Tegra::Texture::SwizzleSource w_source) {
-        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
-               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
-    }
-
     // Store a copy of these values to avoid double dereference when reading them
     const SurfaceParams params;
     const VkImage image;
@@ -196,15 +184,18 @@ private:
 
     const VKDevice& device;
     CachedSurface& surface;
-    const u32 base_layer;
-    const u32 num_layers;
     const u32 base_level;
     const u32 num_levels;
     const VkImageViewType image_view_type;
+    u32 base_layer = 0;
+    u32 num_layers = 0;
+    u32 base_slice = 0;
+    u32 num_slices = 0;
 
     VkImageView last_image_view = nullptr;
     u32 last_swizzle = 0;
 
+    vk::ImageView render_target;
     std::unordered_map<u32, vk::ImageView> view_cache;
 };
 
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 681ecde98..351c048d2 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() {
 }
 
 void VKUpdateDescriptorQueue::Acquire() {
-    entries.clear();
-}
+    // Minimum number of entries required.
+    // This is the maximum number of entries a single draw call might use.
+    static constexpr std::size_t MIN_ENTRIES = 0x400;
 
-void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
-                                   VkDescriptorSet set) {
-    if (payload.size() + entries.size() >= payload.max_size()) {
+    if (payload.size() + MIN_ENTRIES >= payload.max_size()) {
         LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");
         scheduler.WaitWorker();
         payload.clear();
     }
+    upload_start = payload.data() + payload.size(); // end() must not be dereferenced
+}
 
-    // TODO(Rodrigo): Rework to write the payload directly
-    const auto payload_start = payload.data() + payload.size();
-    for (const auto& entry : entries) {
-        if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) {
-            payload.push_back(*image);
-        } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) {
-            payload.push_back(*buffer);
-        } else if (const auto texel = std::get_if<VkBufferView>(&entry)) {
-            payload.push_back(*texel);
-        } else {
-            UNREACHABLE();
-        }
-    }
-
-    scheduler.Record(
-        [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) {
-            logical->UpdateDescriptorSet(set, update_template, payload_start);
-        });
+void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
+                                   VkDescriptorSet set) {
+    const void* const data = upload_start;
+    const vk::Device* const logical = &device.GetLogical();
+    scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) {
+        logical->UpdateDescriptorSet(set, update_template, data);
+    });
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h
index cc7e3dff4..945320c72 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.h
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h
@@ -15,17 +15,13 @@ namespace Vulkan {
 class VKDevice;
 class VKScheduler;
 
-class DescriptorUpdateEntry {
-public:
-    explicit DescriptorUpdateEntry() {}
-
-    DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {}
+struct DescriptorUpdateEntry {
+    DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {}
 
-    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {}
+    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {}
 
-    DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {}
+    DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {}
 
-private:
     union {
         VkDescriptorImageInfo image;
         VkDescriptorBufferInfo buffer;
@@ -45,32 +41,34 @@ public:
     void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set);
 
     void AddSampledImage(VkSampler sampler, VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
     }
 
     void AddImage(VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
     }
 
     void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) {
-        entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
+        payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
     }
 
     void AddTexelBuffer(VkBufferView texel_buffer) {
-        entries.emplace_back(texel_buffer);
+        payload.emplace_back(texel_buffer);
     }
 
-    VkImageLayout* GetLastImageLayout() {
-        return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout;
+    VkImageLayout* LastImageLayout() {
+        return &payload.back().image.imageLayout;
     }
 
-private:
-    using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>;
+    const VkImageLayout* LastImageLayout() const {
+        return &payload.back().image.imageLayout; // NOTE(review): last entry must be an image
+    }
 
+private:
     const VKDevice& device;
     VKScheduler& scheduler;
 
-    boost::container::static_vector<Variant, 0x400> entries;
+    const DescriptorUpdateEntry* upload_start = nullptr;
     boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload;
 };
 
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 2ce9b0626..0d485a662 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
 
 bool Load(InstanceDispatch& dld) noexcept {
 #define X(name) Proc(dld.name, dld, #name)
-    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties);
+    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) &&
+           X(vkEnumerateInstanceLayerProperties);
 #undef X
 }
 
@@ -725,8 +726,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s
     return supported == VK_TRUE;
 }
 
-VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const
-    noexcept {
+VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const {
     VkSurfaceCapabilitiesKHR capabilities;
     Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities));
     return capabilities;
@@ -771,4 +771,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
     return properties;
 }
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld) {
+    u32 layer_count; // Written by the count-only query below
+    if (dld.vkEnumerateInstanceLayerProperties(&layer_count, nullptr) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    std::vector<VkLayerProperties> layers(layer_count);
+    if (dld.vkEnumerateInstanceLayerProperties(&layer_count, layers.data()) == VK_SUCCESS) {
+        return layers;
+    }
+    return std::nullopt;
+}
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index 98937a77a..d56fdb3f9 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -141,6 +141,7 @@ struct InstanceDispatch {
     PFN_vkCreateInstance vkCreateInstance;
     PFN_vkDestroyInstance vkDestroyInstance;
     PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties;
+    PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;
 
     PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;
     PFN_vkCreateDevice vkCreateDevice;
@@ -779,7 +780,7 @@ public:
 
     bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const;
 
-    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept;
+    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const;
 
     std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const;
 
@@ -996,4 +997,7 @@ private:
 std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
     const InstanceDispatch& dld);
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld);
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 848e46874..b2e88fa20 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -13,55 +13,101 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
 
 u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (instr.hset2.ftz == 0) {
-        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    PredCondition cond;
+    bool bf;
+    bool ftz;
+    bool neg_a;
+    bool abs_a;
+    bool neg_b;
+    bool abs_b;
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_C:
+    case OpCode::Id::HSET2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        bf = instr.Bit(53);
+        ftz = instr.Bit(54);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(56);
+        abs_b = instr.Bit(54);
+        break;
+    case OpCode::Id::HSET2_R:
+        cond = instr.hsetp2.reg.cond;
+        bf = instr.Bit(49);
+        ftz = instr.Bit(50);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(31);
+        abs_b = instr.Bit(30);
+        break;
+    default:
+        UNREACHABLE();
     }
 
-    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
-
-    Node op_b = [&]() {
+    Node op_b = [this, instr, opcode] {
         switch (opcode->get().GetId()) {
+        case OpCode::Id::HSET2_C:
+            // Inform as unimplemented as this is not tested.
+            UNIMPLEMENTED_MSG("HSET2_C is not implemented");
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         case OpCode::Id::HSET2_R:
             return GetRegister(instr.gpr20);
+        case OpCode::Id::HSET2_IMM:
+            return UnpackHalfImmediate(instr, true);
         default:
             UNREACHABLE();
-            return Immediate(0);
+            return Node{};
         }
     }();
-    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
-    op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
-    const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+    if (!ftz) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_R:
+        op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b);
+        [[fallthrough]];
+    case OpCode::Id::HSET2_C:
+        op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b);
+        break;
+    default:
+        break;
+    }
 
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
+    Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+
+    Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
     // HSET2 operates on each half float in the pack.
     std::array<Node, 2> values;
     for (u32 i = 0; i < 2; ++i) {
-        const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff;
-        const Node true_value = Immediate(raw_value << (i * 16));
-        const Node false_value = Immediate(0);
-
-        const Node comparison =
-            Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
-        const Node predicate = Operation(combiner, comparison, second_pred);
+        const u32 raw_value = bf ? 0x3c00 : 0xffff;
+        Node true_value = Immediate(raw_value << (i * 16));
+        Node false_value = Immediate(0);
 
+        Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
+        Node predicate = Operation(combiner, comparison, second_pred);
         values[i] =
-            Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value);
+            Operation(OperationCode::Select, predicate, move(true_value), move(false_value));
     }
 
-    const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]);
-    SetRegister(bb, instr.gpr0, value);
+    Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]);
+    SetRegister(bb, instr.gpr0, move(value));
 
     return pc;
 }
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 60b6ad72a..07778dc3e 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         break;
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         if (component == 0) {
             return descriptor.b_type;
         }
@@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         }
         break;
     }
-    UNIMPLEMENTED_MSG("texture format not implement={}", format);
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
     return ComponentType::FLOAT;
 }
 
@@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
             return 6;
         }
         return 0;
+    case TextureFormat::BF10GF11RF11:
+        if (component == 1 || component == 2) {
+            return 11;
+        }
+        if (component == 0) {
+            return 10;
+        }
+        return 0;
     case TextureFormat::G8R24:
         if (component == 0) {
             return 8;
@@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
         return (component == 0 || component == 1) ? 8 : 0;
     case TextureFormat::G4R4:
         return (component == 0 || component == 1) ? 4 : 0;
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return 0;
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return 0;
 }
 
 std::size_t GetImageComponentMask(TextureFormat format) {
@@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R32_B24G8:
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         return std::size_t{R | G | B};
     case TextureFormat::R32_G32:
     case TextureFormat::R16_G16:
@@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R8:
     case TextureFormat::R1:
         return std::size_t{R};
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return std::size_t{R | G | B | A};
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return std::size_t{R | G | B | A};
 }
 
 std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
@@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,
             return {std::move(original_value), true};
         }
     default:
-        UNIMPLEMENTED_MSG("Unimplement component type={}", component_type);
+        UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);
         return {std::move(original_value), true};
     }
 }
@@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
             default:
                 break;
             }
-            UNIMPLEMENTED_MSG("Unimplemented operation={} type={}",
+            UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",
                               static_cast<u64>(instr.suatom_d.operation.Value()),
                               static_cast<u64>(instr.suatom_d.operation_type.Value()));
             return OperationCode::AtomicImageAdd;
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 8f0bb996e..29ebf65ba 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
     return pc;
 }
 
-ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
-                                               std::optional<u32> buffer) {
+ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(
+    SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {
     if (info.IsComplete()) {
         return info;
     }
-    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
-                                : registry.ObtainBoundSampler(offset);
     if (!sampler) {
         LOG_WARNING(HW_GPU, "Unknown sampler info");
         info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D);
@@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
 
 std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,
                                             SamplerInfo sampler_info) {
-    const auto offset = static_cast<u32>(sampler.index.Value());
-    const auto info = GetSamplerInfo(sampler_info, offset);
+    const u32 offset = static_cast<u32>(sampler.index.Value());
+    const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));
 
     // If this sampler has already been used, return the existing mapping.
     const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
@@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
     const Node sampler_register = GetRegister(reg);
     const auto [base_node, tracked_sampler_info] =
         TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    ASSERT(base_node != nullptr);
-    if (base_node == nullptr) {
+    if (!base_node) {
+        UNREACHABLE();
         return std::nullopt;
     }
 
-    if (const auto bindless_sampler_info =
-            std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
-        const u32 buffer = bindless_sampler_info->GetIndex();
-        const u32 offset = bindless_sampler_info->GetOffset();
-        info = GetSamplerInfo(info, offset, buffer);
+    if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
+        const u32 buffer = sampler_info->index;
+        const u32 offset = sampler_info->offset;
+        info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
-                                     [buffer = buffer, offset = offset](const Sampler& entry) {
+                                     [buffer, offset](const Sampler& entry) {
                                          return entry.buffer == buffer && entry.offset == offset;
                                      });
         if (it != used_samplers.end()) {
@@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
         return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,
                                           *info.is_shadow, *info.is_buffer, false);
     }
-    if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
-        const u32 base_offset = array_sampler_info->GetBaseOffset() / 4;
-        index_var = GetCustomVariable(array_sampler_info->GetIndexVar());
-        info = GetSamplerInfo(info, base_offset);
+    if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) {
+        const std::pair indices = sampler_info->indices;
+        const std::pair offsets = sampler_info->offsets;
+        info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets));
+
+        // Try to use an already created sampler if it exists
+        const auto it = std::find_if(
+            used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) {
+                return offsets == std::pair{entry.offset, entry.secondary_offset} &&
+                       indices == std::pair{entry.buffer, entry.secondary_buffer};
+            });
+        if (it != used_samplers.end()) {
+            ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array &&
+                   it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer);
+            return *it;
+        }
+
+        // Otherwise create a new mapping for this sampler
+        const u32 next_index = static_cast<u32>(used_samplers.size());
+        return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array,
+                                          *info.is_shadow, *info.is_buffer);
+    }
+    if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
+        const u32 base_offset = sampler_info->base_offset / 4;
+        index_var = GetCustomVariable(sampler_info->bindless_var);
+        info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(
diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp
index 074f21691..5071c83ca 100644
--- a/src/video_core/shader/memory_util.cpp
+++ b/src/video_core/shader/memory_util.cpp
@@ -66,12 +66,12 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_add
 
 u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code,
                         const ProgramCode& code_b) {
-    u64 unique_identifier = boost::hash_value(code);
+    size_t unique_identifier = boost::hash_value(code);
     if (is_a) {
         // VertexA programs include two programs
         boost::hash_combine(unique_identifier, boost::hash_value(code_b));
     }
-    return unique_identifier;
+    return static_cast<u64>(unique_identifier);
 }
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c5e5165ff..8f230d57a 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;
 using Node4 = std::array<Node, 4>;
 using NodeBlock = std::vector<Node>;
 
-class BindlessSamplerNode;
-class ArraySamplerNode;
+struct ArraySamplerNode;
+struct BindlessSamplerNode;
+struct SeparateSamplerNode;
 
-using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>;
+using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
 using TrackSampler = std::shared_ptr<TrackSamplerData>;
 
 struct Sampler {
@@ -288,63 +289,51 @@ struct Sampler {
         : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},
           is_buffer{is_buffer}, is_indexed{is_indexed} {}
 
+    /// Separate sampler constructor; each pair fills the primary and secondary field in order
+    constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers,
+                               Tegra::Shader::TextureType type, bool is_array, bool is_shadow,
+                               bool is_buffer)
+        : index{index}, offset{offsets.first}, secondary_offset{offsets.second},
+          buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array},
+          is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {}
+
     /// Bindless samplers constructor
     constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
                                bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)
         : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
           is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {}
 
-    u32 index = 0;  ///< Emulated index given for the this sampler.
-    u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
-    u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
-    u32 size = 1;   ///< Size of the sampler.
+    u32 index = 0;            ///< Emulated index given for the this sampler.
+    u32 offset = 0;           ///< Offset in the const buffer from where the sampler is being read.
+    u32 secondary_offset = 0; ///< Secondary offset in the const buffer.
+    u32 buffer = 0;           ///< Buffer where the bindless sampler is read.
+    u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read.
+    u32 size = 1;             ///< Size of the sampler.
 
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array = false;    ///< Whether the texture is being sampled as an array texture or not.
-    bool is_shadow = false;   ///< Whether the texture is being sampled as a depth texture or not.
-    bool is_buffer = false;   ///< Whether the texture is a texture buffer without sampler.
-    bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
-    bool is_indexed = false;  ///< Whether this sampler is an indexed array of textures.
+    bool is_array = false;     ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow = false;    ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_buffer = false;    ///< Whether the texture is a texture buffer without sampler.
+    bool is_bindless = false;  ///< Whether this sampler belongs to a bindless texture or not.
+    bool is_indexed = false;   ///< Whether this sampler is an indexed array of textures.
+    bool is_separated = false; ///< Whether the image and sampler is separated or not.
 };
 
 /// Represents a tracked bindless sampler into a direct const buffer
-class ArraySamplerNode final {
-public:
-    explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var)
-        : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetBaseOffset() const {
-        return base_offset;
-    }
-
-    constexpr u32 GetIndexVar() const {
-        return bindless_var;
-    }
-
-private:
+struct ArraySamplerNode {
     u32 index;
     u32 base_offset;
     u32 bindless_var;
 };
 
-/// Represents a tracked bindless sampler into a direct const buffer
-class BindlessSamplerNode final {
-public:
-    explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetOffset() const {
-        return offset;
-    }
+/// Represents a tracked separate sampler image pair that was folded statically
+struct SeparateSamplerNode {
+    std::pair<u32, u32> indices; ///< Becomes Sampler::buffer / secondary_buffer
+    std::pair<u32, u32> offsets; ///< Becomes Sampler::offset / secondary_offset
+};
 
-private:
+/// Represents a tracked bindless sampler into a direct const buffer
+struct BindlessSamplerNode {
     u32 index;
     u32 offset;
 };
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
index 11231bbea..1e0886185 100644
--- a/src/video_core/shader/node_helper.h
+++ b/src/video_core/shader/node_helper.h
@@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {
 template <typename T, typename... Args>
 TrackSampler MakeTrackSampler(Args&&... args) {
     static_assert(std::is_convertible_v<T, TrackSamplerData>);
-    return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...));
+    return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
 }
 
 template <typename... Args>
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index af70b3f35..cdf274e54 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
     return value;
 }
 
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler(
+    std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) {
+    const SeparateSamplerKey key{buffers, offsets};
+    // Reuse the descriptor if this buffer/offset combination was resolved before.
+    if (const auto cached = separate_samplers.find(key); cached != separate_samplers.end()) {
+        return cached->second;
+    }
+    // No engine attached: the handles cannot be read, so report "unknown".
+    if (engine == nullptr) {
+        return std::nullopt;
+    }
+
+    // Read one 32-bit word per (buffer, offset) pair and OR both into the sampler handle.
+    const u32 handle_1 = engine->AccessConstBuffer32(stage, buffers.first, offsets.first);
+    const u32 handle_2 = engine->AccessConstBuffer32(stage, buffers.second, offsets.second);
+    const SamplerDescriptor descriptor = engine->AccessSampler(handle_1 | handle_2);
+    separate_samplers.emplace(key, descriptor);
+    return descriptor;
+}
+
 std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
                                                                                  u32 offset) {
     const std::pair key = {buffer, offset};
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 0c80d35fd..231206765 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -19,8 +19,39 @@
 
 namespace VideoCommon::Shader {
 
+struct SeparateSamplerKey {
+    std::pair<u32, u32> buffers; ///< Const buffer indices passed to ObtainSeparateSampler
+    std::pair<u32, u32> offsets; ///< Offsets read from the matching entry of `buffers`
+};
+
+} // namespace VideoCommon::Shader
+
+namespace std {
+
+template <>
+struct hash<VideoCommon::Shader::SeparateSamplerKey> {
+    std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept {
+        const auto& [buffers, offsets] = key; // XOR-fold all four fields, then hash the word
+        return std::hash<u32>{}(buffers.first ^ buffers.second ^ offsets.first ^ offsets.second);
+    }
+};
+
+template <>
+struct equal_to<VideoCommon::Shader::SeparateSamplerKey> {
+    bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs,
+                    const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept {
+        return lhs.offsets == rhs.offsets && lhs.buffers == rhs.buffers; // both pairs must match
+    }
+};
+
+} // namespace std
+
+namespace VideoCommon::Shader {
+
 using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
 using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using SeparateSamplerMap =
+    std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;
 using BindlessSamplerMap =
     std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
 
@@ -73,6 +104,9 @@ public:
 
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
 
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler(
+        std::pair<u32, u32> buffers, std::pair<u32, u32> offsets);
+
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
 
     /// Inserts a key.
@@ -128,6 +162,7 @@ private:
     Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
     KeyMap keys;
     BoundSamplerMap bound_samplers;
+    SeparateSamplerMap separate_samplers;
     BindlessSamplerMap bindless_samplers;
     u32 bound_buffer;
     GraphicsInfo graphics_info;
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 15ae152f2..3a98b2104 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -330,8 +330,8 @@ private:
     OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
 
     /// Queries the missing sampler info from the execution context.
-    SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset,
-                               std::optional<u32> buffer = std::nullopt);
+    SamplerInfo GetSamplerInfo(SamplerInfo info,
+                               std::optional<Tegra::Engines::SamplerDescriptor> sampler);
 
     /// Accesses a texture sampler.
     std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info);
@@ -409,8 +409,14 @@ private:
 
     std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
-    std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                        s64 cursor);
+    std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                       s64 cursor);
+
+    std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf,
+                                                             const OperationNode& operation,
+                                                             Node gpr, Node base_offset,
+                                                             Node tracked, const NodeBlock& code,
+                                                             s64 cursor);
 
     std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index eb97bfd41..d5ed81442 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -14,6 +14,7 @@
 namespace VideoCommon::Shader {
 
 namespace {
+
 std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                    OperationCode operation_code) {
     for (; cursor >= 0; --cursor) {
@@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
     if (const auto operation = std::get_if<OperationNode>(&*node)) {
         operation->SetAmendIndex(amend_index);
         return true;
-    } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+    }
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
         conditional->SetAmendIndex(amend_index);
         return true;
     }
@@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
 
 } // Anonymous namespace
 
-std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                              s64 cursor) {
+std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                             s64 cursor) {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
+        const u32 cbuf_index = cbuf->GetIndex();
+
         // Constant buffer found, test if it's an immediate
         const auto& offset = cbuf->GetOffset();
         if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
-            auto track =
-                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
+            auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());
             return {tracked, track};
         }
         if (const auto operation = std::get_if<OperationNode>(&*offset)) {
             const u32 bound_buffer = registry.GetBoundBuffer();
-            if (bound_buffer != cbuf->GetIndex()) {
+            if (bound_buffer != cbuf_index) {
                 return {};
             }
-            const auto pair = DecoupleIndirectRead(*operation);
-            if (!pair) {
-                return {};
+            if (const std::optional pair = DecoupleIndirectRead(*operation)) {
+                auto [gpr, base_offset] = *pair;
+                return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked,
+                                                  code, cursor);
             }
-            auto [gpr, base_offset] = *pair;
-            const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
-            const auto& gpu_driver = registry.AccessGuestDriverProfile();
-            const u32 bindless_cv = NewCustomVariable();
-            Node op =
-                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
-
-            const Node cv_node = GetCustomVariable(bindless_cv);
-            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
-            const std::size_t amend_index = DeclareAmend(std::move(amend_op));
-            AmendNodeCv(amend_index, code[cursor]);
-            // TODO Implement Bindless Index custom variable
-            auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
-                                                            offset_inm->GetValue(), bindless_cv);
-            return {tracked, track};
         }
         return {};
     }
@@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
         return TrackBindlessSampler(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
-            if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor);
-                std::get<0>(found)) {
-                // Cbuf found in operand.
+        const OperationNode& op = *operation;
+
+        const OperationCode opcode = operation->GetCode();
+        if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) {
+            ASSERT(op.GetOperandsCount() == 2);
+            auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor);
+            auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor);
+            if (node_a && node_b) {
+                auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b},
+                                                                   std::pair{offset_a, offset_b});
+                return {tracked, std::move(track)};
+            }
+        }
+        // Walk the operands from last to first; op[i - 1] is the operand being inspected.
+        for (std::size_t i = op.GetOperandsCount(); i > 0; --i) {
+            if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) {
+                // Constant buffer found in operand.
                 return found;
             }
         }
@@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
     return {};
 }
 
+std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead(
+    const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked,
+    const NodeBlock& code, s64 cursor) {
+    // std::get throws std::bad_variant_access when the base offset is not an immediate.
+    const auto immediate_offset = std::get<ImmediateNode>(*base_offset).GetValue();
+    const auto& driver_profile = registry.AccessGuestDriverProfile();
+    const u32 index_variable = NewCustomVariable();
+    // Divide the register value by the guest driver's texture handler size.
+    Node handle_index = Operation(OperationCode::UDiv, gpr,
+                                  Immediate(driver_profile.GetTextureHandlerSize()));
+    // Assign the quotient to the new custom variable and amend the node at the cursor with it.
+    Node assignment = Operation(OperationCode::Assign, GetCustomVariable(index_variable),
+                                std::move(handle_index));
+    AmendNodeCv(DeclareAmend(std::move(assignment)), code[cursor]);
+
+    // TODO: Implement bindless index custom variable
+    return {tracked, MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), immediate_offset,
+                                                        index_variable)};
+}
+
 std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
                                                s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h
new file mode 100644
index 000000000..2dd270e99
--- /dev/null
+++ b/src/video_core/shader_cache.h
@@ -0,0 +1,253 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+/// @brief Cache of shaders indexed by start address with page-granular invalidation
+/// @note Entries are registered in every page they touch and removed from all of them at once
+///       when invalidated, so no page keeps a pointer to an entry that is about to be freed
+template <class T>
+class ShaderCache {
+    static constexpr u64 PAGE_BITS = 14;
+
+    struct Entry {
+        VAddr addr_start;
+        VAddr addr_end;
+        T* data;
+
+        bool is_memory_marked = true;
+
+        constexpr bool Overlaps(VAddr start, VAddr end) const noexcept {
+            return start < addr_end && addr_start < end;
+        }
+    };
+
+public:
+    virtual ~ShaderCache() = default;
+
+    /// @brief Removes shaders inside a given region
+    /// @note Checks for ranges
+    /// @param addr Start address of the invalidation
+    /// @param size Number of bytes of the invalidation
+    void InvalidateRegion(VAddr addr, std::size_t size) {
+        std::scoped_lock lock{invalidation_mutex};
+        InvalidatePagesInRegion(addr, size);
+        RemovePendingShaders();
+    }
+
+    /// @brief Unmarks a memory region as cached and marks it for removal
+    /// @param addr Start address of the CPU write operation
+    /// @param size Number of bytes of the CPU write operation
+    void OnCPUWrite(VAddr addr, std::size_t size) {
+        std::lock_guard lock{invalidation_mutex};
+        InvalidatePagesInRegion(addr, size);
+    }
+
+    /// @brief Flushes delayed removal operations
+    void SyncGuestHost() {
+        std::scoped_lock lock{invalidation_mutex};
+        RemovePendingShaders();
+    }
+
+    /// @brief Tries to obtain a cached shader starting in a given address
+    /// @note Doesn't check for ranges, the given address has to be the start of the shader
+    /// @param addr Start address of the shader, this doesn't cache for region
+    /// @return Pointer to a valid shader, nullptr when nothing is found
+    T* TryGet(VAddr addr) const {
+        std::scoped_lock lock{lookup_mutex};
+
+        const auto it = lookup_cache.find(addr);
+        if (it == lookup_cache.end()) {
+            return nullptr;
+        }
+        return it->second->data;
+    }
+
+protected:
+    explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {}
+
+    /// @brief Register in the cache a given entry
+    /// @param data Shader to store in the cache
+    /// @param addr Start address of the shader that will be registered
+    /// @param size Size in bytes of the shader
+    void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) {
+        std::scoped_lock lock{invalidation_mutex, lookup_mutex};
+
+        const VAddr addr_end = addr + size;
+        Entry* const entry = NewEntry(addr, addr_end, data.get());
+
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+            invalidation_cache[page].push_back(entry);
+        }
+
+        storage.push_back(std::move(data));
+
+        rasterizer.UpdatePagesCachedCount(addr, size, 1);
+    }
+
+    /// @brief Called when a shader is going to be removed
+    /// @param shader Shader that will be removed
+    /// @pre invalidation_mutex is locked
+    /// @pre lookup_mutex is locked
+    virtual void OnShaderRemoval([[maybe_unused]] T* shader) {}
+
+private:
+    /// @brief Invalidate pages in a given region
+    /// @pre invalidation_mutex is locked
+    void InvalidatePagesInRegion(VAddr addr, std::size_t size) {
+        const VAddr addr_end = addr + size;
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+            const auto it = invalidation_cache.find(page);
+            if (it == invalidation_cache.end()) {
+                continue;
+            }
+
+            std::vector<Entry*>& entries = it->second;
+            InvalidatePageEntries(entries, addr, addr_end);
+
+            // If there's nothing else in this page, remove it to avoid overpopulating the hash map.
+            if (entries.empty()) {
+                invalidation_cache.erase(it);
+            }
+        }
+    }
+
+    /// @brief Remove shaders marked for deletion
+    /// @pre invalidation_mutex is locked
+    void RemovePendingShaders() {
+        if (marked_for_removal.empty()) {
+            return;
+        }
+        std::scoped_lock lock{lookup_mutex};
+
+        std::vector<T*> removed_shaders;
+        removed_shaders.reserve(marked_for_removal.size());
+
+        for (Entry* const entry : marked_for_removal) {
+            // Copy the fields first: erasing from lookup_cache destroys the entry itself
+            const VAddr addr_start = entry->addr_start;
+            T* const shader = entry->data;
+            if (lookup_cache.erase(addr_start) > 0) {
+                removed_shaders.push_back(shader);
+            }
+        }
+        marked_for_removal.clear();
+
+        if (!removed_shaders.empty()) {
+            RemoveShadersFromStorage(std::move(removed_shaders));
+        }
+    }
+
+    /// @brief Invalidates entries in a given range for the passed page
+    /// @param entries         Vector of entries in the page, it will be modified on overlaps
+    /// @param addr            Start address of the invalidation
+    /// @param addr_end        Non-inclusive end address of the invalidation
+    /// @pre invalidation_mutex is locked
+    void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
+        // Iterate by index: RemoveEntryFromInvalidationCache erases from this same vector,
+        // so an iterator would be invalidated. The next candidate shifts into entries[index].
+        std::size_t index = 0;
+        while (index < entries.size()) {
+            Entry* const entry = entries[index];
+            if (!entry->Overlaps(addr, addr_end)) {
+                ++index;
+                continue;
+            }
+            UnmarkMemory(entry);
+            // Unlink from every page so the entry is marked once and never dangles
+            RemoveEntryFromInvalidationCache(entry);
+            marked_for_removal.push_back(entry);
+        }
+    }
+
+    /// @brief Removes an entry from the vector of every page it was registered in
+    /// @param entry Entry to unlink, its address range selects the same pages as in Register
+    /// @pre invalidation_mutex is locked
+    void RemoveEntryFromInvalidationCache(const Entry* entry) {
+        const u64 page_end = entry->addr_end >> PAGE_BITS;
+        for (u64 page = entry->addr_start >> PAGE_BITS; page <= page_end; ++page) {
+            const auto it = invalidation_cache.find(page);
+            if (it == invalidation_cache.end()) {
+                continue;
+            }
+            // Only vector elements are erased; the map node stays because a caller may still
+            // hold a reference to this page's vector.
+            std::vector<Entry*>& entries = it->second;
+            entries.erase(std::remove(entries.begin(), entries.end(), entry), entries.end());
+        }
+    }
+
+    /// @brief Unmarks an entry from the rasterizer cache
+    /// @param entry Entry to unmark from memory
+    void UnmarkMemory(Entry* entry) {
+        if (!entry->is_memory_marked) {
+            return;
+        }
+        entry->is_memory_marked = false;
+
+        const VAddr addr = entry->addr_start;
+        const std::size_t size = entry->addr_end - addr;
+        rasterizer.UpdatePagesCachedCount(addr, size, -1);
+    }
+
+    /// @brief Removes a vector of shaders from a list
+    /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates
+    /// @pre invalidation_mutex is locked
+    /// @pre lookup_mutex is locked
+    void RemoveShadersFromStorage(std::vector<T*> removed_shaders) {
+        // Remove duplicates
+        std::sort(removed_shaders.begin(), removed_shaders.end());
+        removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()),
+                              removed_shaders.end());
+
+        // Now that there are no duplicates, we can notify removals
+        for (T* const shader : removed_shaders) {
+            OnShaderRemoval(shader);
+        }
+
+        // Remove them from the cache; the vector is sorted, so a binary search is enough
+        const auto is_removed = [&removed_shaders](const std::unique_ptr<T>& shader) {
+            return std::binary_search(removed_shaders.begin(), removed_shaders.end(),
+                                      shader.get());
+        };
+        storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end());
+    }
+
+    /// @brief Creates a new entry in the lookup cache and returns its pointer
+    /// @pre lookup_mutex is locked
+    Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) {
+        auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data});
+        Entry* const entry_pointer = entry.get();
+
+        lookup_cache.emplace(addr, std::move(entry));
+        return entry_pointer;
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+
+    mutable std::mutex lookup_mutex;
+    std::mutex invalidation_mutex;
+
+    std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache;
+    std::unordered_map<u64, std::vector<Entry*>> invalidation_cache;
+    std::vector<std::unique_ptr<T>> storage;
+    std::vector<Entry*> marked_for_removal;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 715f39d0d..0caf3b4f0 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
     }
     const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
     const auto layer{static_cast<u32>(relative_address / layer_size)};
+    if (layer >= params.depth) {
+        return {};
+    }
     const GPUVAddr mipmap_address = relative_address - layer_size * layer;
     const auto mipmap_it =
         Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
@@ -248,12 +251,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
 
     // Use an extra temporal buffer
     auto& tmp_buffer = staging_cache.GetBuffer(1);
-    // Special case for 3D Texture Segments
-    const bool must_read_current_data =
-        params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
     tmp_buffer.resize(guest_memory_size);
     host_ptr = tmp_buffer.data();
-    if (must_read_current_data) {
+
+    if (params.target == SurfaceTarget::Texture3D) {
+        // Special case for 3D texture segments
         memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
     }
 
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 79e10ffbb..173f2edba 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -217,8 +217,8 @@ public:
     }
 
     bool IsProtected() const {
-        // Only 3D Slices are to be protected
-        return is_target && params.block_depth > 0;
+        // Only 3D slices are to be protected
+        return is_target && params.target == SurfaceTarget::Texture3D;
     }
 
     bool IsRenderTarget() const {
@@ -250,6 +250,11 @@ public:
         return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
     }
 
+    TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+        const ViewParams view(SurfaceTarget::Texture3D, slice, depth, base_level, num_levels);
+        return GetView(view);
+    }
+
     std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
                                               const GPUVAddr view_addr,
                                               const std::size_t candidate_size, const u32 mipmap,
@@ -272,8 +277,8 @@ public:
     std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
                                      const std::size_t candidate_size) {
         if (params.target == SurfaceTarget::Texture3D ||
-            (params.num_levels == 1 && !params.is_layered) ||
-            view_params.target == SurfaceTarget::Texture3D) {
+            view_params.target == SurfaceTarget::Texture3D ||
+            (params.num_levels == 1 && !params.is_layered)) {
             return {};
         }
         const auto layer_mipmap{GetLayerMipmap(view_addr)};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 884fabffe..0b2b2b8c4 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
     params.num_levels = 1;
     params.emulated_levels = 1;
 
-    const bool is_layered = config.layers > 1 && params.block_depth == 0;
-    params.is_layered = is_layered;
-    params.depth = is_layered ? config.layers.Value() : 1;
-    params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
+    if (config.memory_layout.is_3d != 0) {
+        params.depth = config.layers.Value();
+        params.is_layered = false;
+        params.target = SurfaceTarget::Texture3D;
+    } else if (config.layers > 1) {
+        params.depth = config.layers.Value();
+        params.is_layered = true;
+        params.target = SurfaceTarget::Texture2DArray;
+    } else {
+        params.depth = 1;
+        params.is_layered = false;
+        params.target = SurfaceTarget::Texture2D;
+    }
     return params;
 }
 
@@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
     params.width = config.width;
     params.height = config.height;
     params.pitch = config.pitch;
-    // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters
+    // TODO(Rodrigo): Try to guess texture arrays from parameters
     params.target = SurfaceTarget::Texture2D;
     params.depth = 1;
     params.num_levels = 1;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 6f63217a2..6207d8dfe 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -24,6 +24,7 @@
 #include "core/core.h"
 #include "core/memory.h"
 #include "core/settings.h"
+#include "video_core/compatible_formats.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -47,8 +48,8 @@ class RasterizerInterface;
 
 namespace VideoCommon {
 
+using VideoCore::Surface::FormatCompatibility;
 using VideoCore::Surface::PixelFormat;
-
 using VideoCore::Surface::SurfaceTarget;
 using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
 
@@ -298,15 +299,13 @@ public:
         const GPUVAddr src_gpu_addr = src_config.Address();
         const GPUVAddr dst_gpu_addr = dst_config.Address();
         DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
-        const std::optional<VAddr> dst_cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr);
-        const std::optional<VAddr> src_cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
-        std::pair<TSurface, TView> dst_surface =
-            GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
-        std::pair<TSurface, TView> src_surface =
-            GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
-        ImageBlit(src_surface.second, dst_surface.second, copy_config);
+
+        const auto& memory_manager = system.GPU().MemoryManager();
+        const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
+        const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
+        std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+        TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
+        ImageBlit(src_surface, dst_surface.second, copy_config);
         dst_surface.first->MarkAsModified(true, Tick());
     }
 
@@ -508,12 +507,12 @@ private:
             return RecycleStrategy::Flush;
         }
         // 3D Textures decision
-        if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) {
+        if (params.target == SurfaceTarget::Texture3D) {
             return RecycleStrategy::Flush;
         }
         for (const auto& s : overlaps) {
             const auto& s_params = s->GetSurfaceParams();
-            if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) {
+            if (s_params.target == SurfaceTarget::Texture3D) {
                 return RecycleStrategy::Flush;
             }
         }
@@ -597,7 +596,7 @@ private:
         } else {
             new_surface = GetUncachedSurface(gpu_addr, params);
         }
-        const auto& final_params = new_surface->GetSurfaceParams();
+        const SurfaceParams& final_params = new_surface->GetSurfaceParams();
         if (cr_params.type != final_params.type) {
             if (Settings::IsGPULevelExtreme()) {
                 BufferCopy(current_surface, new_surface);
@@ -605,7 +604,7 @@ private:
         } else {
             std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
             for (auto& brick : bricks) {
-                ImageCopy(current_surface, new_surface, brick);
+                TryCopyImage(current_surface, new_surface, brick);
             }
         }
         Unregister(current_surface);
@@ -696,7 +695,7 @@ private:
                 }
                 const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
                                              src_params.depth);
-                ImageCopy(surface, new_surface, copy_params);
+                TryCopyImage(surface, new_surface, copy_params);
             }
         }
         if (passed_tests == 0) {
@@ -731,51 +730,9 @@ private:
      */
     std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
                                                                const SurfaceParams& params,
-                                                               const GPUVAddr gpu_addr,
-                                                               const VAddr cpu_addr,
+                                                               GPUVAddr gpu_addr, VAddr cpu_addr,
                                                                bool preserve_contents) {
-        if (params.target == SurfaceTarget::Texture3D) {
-            bool failed = false;
-            if (params.num_levels > 1) {
-                // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
-                return std::nullopt;
-            }
-            TSurface new_surface = GetUncachedSurface(gpu_addr, params);
-            bool modified = false;
-            for (auto& surface : overlaps) {
-                const SurfaceParams& src_params = surface->GetSurfaceParams();
-                if (src_params.target != SurfaceTarget::Texture2D) {
-                    failed = true;
-                    break;
-                }
-                if (src_params.height != params.height) {
-                    failed = true;
-                    break;
-                }
-                if (src_params.block_depth != params.block_depth ||
-                    src_params.block_height != params.block_height) {
-                    failed = true;
-                    break;
-                }
-                const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
-                const auto offsets = params.GetBlockOffsetXYZ(offset);
-                const auto z = std::get<2>(offsets);
-                modified |= surface->IsModified();
-                const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
-                                             1);
-                ImageCopy(surface, new_surface, copy_params);
-            }
-            if (failed) {
-                return std::nullopt;
-            }
-            for (const auto& surface : overlaps) {
-                Unregister(surface);
-            }
-            new_surface->MarkAsModified(modified, Tick());
-            Register(new_surface);
-            auto view = new_surface->GetMainView();
-            return {{std::move(new_surface), view}};
-        } else {
+        if (params.target != SurfaceTarget::Texture3D) {
             for (const auto& surface : overlaps) {
                 if (!surface->MatchTarget(params.target)) {
                     if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) {
@@ -791,11 +748,60 @@ private:
                     continue;
                 }
                 if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
-                    return {{surface, surface->GetMainView()}};
+                    return std::make_pair(surface, surface->GetMainView());
                 }
             }
             return InitializeSurface(gpu_addr, params, preserve_contents);
         }
+
+        if (params.num_levels > 1) {
+            // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
+            return std::nullopt;
+        }
+
+        if (overlaps.size() == 1) {
+            const auto& surface = overlaps[0];
+            const SurfaceParams& overlap_params = surface->GetSurfaceParams();
+            // Don't attempt to render to textures with more than one level for now
+            // The requested address has to be at or after the overlap's start to render to it
+            if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) {
+                const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr());
+                const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+                if (slice < overlap_params.depth) {
+                    auto view = surface->Emplace3DView(slice, params.depth, 0, 1);
+                    return std::make_pair(std::move(surface), std::move(view));
+                }
+            }
+        }
+
+        TSurface new_surface = GetUncachedSurface(gpu_addr, params);
+        bool modified = false;
+
+        for (auto& surface : overlaps) {
+            const SurfaceParams& src_params = surface->GetSurfaceParams();
+            if (src_params.target != SurfaceTarget::Texture2D ||
+                src_params.height != params.height ||
+                src_params.block_depth != params.block_depth ||
+                src_params.block_height != params.block_height) {
+                return std::nullopt;
+            }
+            modified |= surface->IsModified();
+
+            const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
+            const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+            const u32 width = params.width;
+            const u32 height = params.height;
+            const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
+            TryCopyImage(surface, new_surface, copy_params);
+        }
+        for (const auto& surface : overlaps) {
+            Unregister(surface);
+        }
+        new_surface->MarkAsModified(modified, Tick());
+        Register(new_surface);
+
+        TView view = new_surface->GetMainView();
+        return std::make_pair(std::move(new_surface), std::move(view));
     }
 
     /**
@@ -873,7 +879,7 @@ private:
             }
         }
 
-        // Check if it's a 3D texture
+        // Manage 3D textures
         if (params.block_depth > 0) {
             auto surface =
                 Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
@@ -1048,7 +1054,7 @@ private:
     void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params,
                         const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) {
         auto deduced_src = DeduceSurface(src_gpu_addr, src_params);
-        auto deduced_dst = DeduceSurface(src_gpu_addr, src_params);
+        auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params);
         if (deduced_src.Failed() || deduced_dst.Failed()) {
             return;
         }
@@ -1187,6 +1193,19 @@ private:
         return {};
     }
 
+    /// Copies an image region between surfaces, logging and skipping incompatible formats.
+    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
+        const PixelFormat src_format = src->GetSurfaceParams().pixel_format;
+        const PixelFormat dst_format = dst->GetSurfaceParams().pixel_format;
+        if (format_compatibility.TestCopy(src_format, dst_format)) {
+            ImageCopy(src, dst, copy);
+            return;
+        }
+        // Note: the destination format is logged first, then the source format.
+        LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", static_cast<int>(dst_format),
+                  static_cast<int>(src_format));
+    }
+
     constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
         return siblings_table[static_cast<std::size_t>(format)];
     }
@@ -1236,6 +1255,7 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
 
     FormatLookupTable format_lookup_table;
+    FormatCompatibility format_compatibility;
 
     u64 ticks{};