28 files changed, 461 insertions, 423 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 2442ddfd6..4408b5001 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,6 +12,10 @@
 #include <utility>
 #include <vector>
 
+#include <boost/icl/interval_map.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <boost/range/iterator_range.hpp>
+
 #include "common/alignment.h"
 #include "common/common_types.h"
 #include "core/core.h"
@@ -30,7 +34,7 @@ public:
     using BufferInfo = std::pair<const TBufferType*, u64>;
 
     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                            bool is_written = false) {
+                            bool is_written = false, bool use_fast_cbuf = false) {
         std::lock_guard lock{mutex};
 
         auto& memory_manager = system.GPU().MemoryManager();
@@ -43,9 +47,13 @@ public:
         // Cache management is a big overhead, so only cache entries with a given size.
         // TODO: Figure out which size is the best for given games.
         constexpr std::size_t max_stream_size = 0x800;
-        if (size < max_stream_size) {
+        if (use_fast_cbuf || size < max_stream_size) {
             if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
-                return StreamBufferUpload(host_ptr, size, alignment);
+                if (use_fast_cbuf) {
+                    return ConstBufferUpload(host_ptr, size);
+                } else {
+                    return StreamBufferUpload(host_ptr, size, alignment);
+                }
             }
         }
 
@@ -152,6 +160,10 @@ protected:
     virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
                            std::size_t dst_offset, std::size_t size) = 0;
 
+    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
+        return {}; // Default fast-cbuf hook: uploads nothing, returns an empty BufferInfo
+    }
+
     /// Register an object into the cache
     void Register(const MapInterval& new_map, bool inherit_written = false) {
         const CacheAddr cache_ptr = new_map->GetStart();
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 7ff44f06d..85d308e26 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -28,6 +28,13 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
     }
 }
 
+// Finds how far src_2 overshoots src_line; returns {dst length shrunk to match, overshoot}.
+std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) {
+    const u32 line_a = src_2 - src_1, line_b = dst_2 - dst_1;
+    const u32 excess = std::max<s32>(0, line_a - src_line + src_1);
+    return {line_a == 0 ? line_b : line_b - (excess * line_b) / line_a, excess}; // never x / 0
+}
+
 void Fermi2D::HandleSurfaceCopy() {
     LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
               static_cast<u32>(regs.operation));
@@ -47,10 +54,27 @@ void Fermi2D::HandleSurfaceCopy() {
         src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
         src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
     }
+    u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width;
+    u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height;
+    const auto [new_dst_w, src_excess_x] =
+        DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width);
+    const auto [new_dst_h, src_excess_y] =
+        DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height);
+    dst_blit_x2 = new_dst_w + regs.blit_dst_x;
+    src_blit_x2 = src_blit_x2 - src_excess_x;
+    dst_blit_y2 = new_dst_h + regs.blit_dst_y;
+    src_blit_y2 = src_blit_y2 - src_excess_y;
+    const auto [new_src_w, dst_excess_x] =
+        DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width);
+    const auto [new_src_h, dst_excess_y] =
+        DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height);
+    src_blit_x2 = new_src_w + src_blit_x1;
+    dst_blit_x2 = dst_blit_x2 - dst_excess_x;
+    src_blit_y2 = new_src_h + src_blit_y1;
+    dst_blit_y2 = dst_blit_y2 - dst_excess_y;
     const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
-    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
-                                          regs.blit_dst_x + regs.blit_dst_width,
-                                          regs.blit_dst_y + regs.blit_dst_height};
+    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2,
+                                          dst_blit_y2};
     Config copy_config;
     copy_config.operation = regs.operation;
     copy_config.filter = regs.blit_control.filter;
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 0901cf2fa..dba342c70 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -99,19 +99,19 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0x80);
+                INSERT_UNION_PADDING_WORDS(0x80);
 
                 Surface dst;
 
-                INSERT_PADDING_WORDS(2);
+                INSERT_UNION_PADDING_WORDS(2);
 
                 Surface src;
 
-                INSERT_PADDING_WORDS(0x15);
+                INSERT_UNION_PADDING_WORDS(0x15);
 
                 Operation operation;
 
-                INSERT_PADDING_WORDS(0x177);
+                INSERT_UNION_PADDING_WORDS(0x177);
 
                 union {
                     u32 raw;
@@ -119,7 +119,7 @@ public:
                     BitField<4, 1, Filter> filter;
                 } blit_control;
 
-                INSERT_PADDING_WORDS(0x8);
+                INSERT_UNION_PADDING_WORDS(0x8);
 
                 u32 blit_dst_x;
                 u32 blit_dst_y;
@@ -130,7 +130,7 @@ public:
                 u64 blit_src_x;
                 u64 blit_src_y;
 
-                INSERT_PADDING_WORDS(0x21);
+                INSERT_UNION_PADDING_WORDS(0x21);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index b185c98c7..5259d92bd 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -51,7 +51,7 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0x60);
+                INSERT_UNION_PADDING_WORDS(0x60);
 
                 Upload::Registers upload;
 
@@ -63,7 +63,7 @@ public:
 
                 u32 data_upload;
 
-                INSERT_PADDING_WORDS(0x3F);
+                INSERT_UNION_PADDING_WORDS(0x3F);
 
                 struct {
                     u32 address;
@@ -72,11 +72,11 @@ public:
                     }
                 } launch_desc_loc;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 u32 launch;
 
-                INSERT_PADDING_WORDS(0x4A7);
+                INSERT_UNION_PADDING_WORDS(0x4A7);
 
                 struct {
                     u32 address_high;
@@ -88,7 +88,7 @@ public:
                     }
                 } tsc;
 
-                INSERT_PADDING_WORDS(0x3);
+                INSERT_UNION_PADDING_WORDS(0x3);
 
                 struct {
                     u32 address_high;
@@ -100,7 +100,7 @@ public:
                     }
                 } tic;
 
-                INSERT_PADDING_WORDS(0x22);
+                INSERT_UNION_PADDING_WORDS(0x22);
 
                 struct {
                     u32 address_high;
@@ -111,11 +111,11 @@ public:
                     }
                 } code_loc;
 
-                INSERT_PADDING_WORDS(0x3FE);
+                INSERT_UNION_PADDING_WORDS(0x3FE);
 
                 u32 tex_cb_index;
 
-                INSERT_PADDING_WORDS(0x374);
+                INSERT_UNION_PADDING_WORDS(0x374);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
@@ -179,7 +179,7 @@ public:
         };
 
         INSERT_PADDING_WORDS(0x11);
-    } launch_description;
+    } launch_description{};
 
     struct {
         u32 write_offset = 0;
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index e0e25c321..396fb6e86 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -45,7 +45,7 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0x60);
+                INSERT_UNION_PADDING_WORDS(0x60);
 
                 Upload::Registers upload;
 
@@ -57,7 +57,7 @@ public:
 
                 u32 data;
 
-                INSERT_PADDING_WORDS(0x11);
+                INSERT_UNION_PADDING_WORDS(0x11);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 8cc842684..1aa7c274f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -496,7 +496,7 @@ public:
             Equation equation_a;
             Factor factor_source_a;
             Factor factor_dest_a;
-            INSERT_PADDING_WORDS(1);
+            INSERT_UNION_PADDING_WORDS(1);
         };
 
         struct RenderTargetConfig {
@@ -517,7 +517,7 @@ public:
             };
             u32 layer_stride;
             u32 base_layer;
-            INSERT_PADDING_WORDS(7);
+            INSERT_UNION_PADDING_WORDS(7);
 
             GPUVAddr Address() const {
                 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
@@ -542,7 +542,7 @@ public:
             f32 translate_x;
             f32 translate_y;
             f32 translate_z;
-            INSERT_PADDING_WORDS(2);
+            INSERT_UNION_PADDING_WORDS(2);
 
             Common::Rectangle<s32> GetRect() const {
                 return {
@@ -606,7 +606,7 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0x45);
+                INSERT_UNION_PADDING_WORDS(0x45);
 
                 struct {
                     u32 upload_address;
@@ -615,7 +615,7 @@ public:
                     u32 bind;
                 } macros;
 
-                INSERT_PADDING_WORDS(0x17);
+                INSERT_UNION_PADDING_WORDS(0x17);
 
                 Upload::Registers upload;
                 struct {
@@ -626,7 +626,7 @@ public:
 
                 u32 data_upload;
 
-                INSERT_PADDING_WORDS(0x44);
+                INSERT_UNION_PADDING_WORDS(0x44);
 
                 struct {
                     union {
@@ -636,11 +636,11 @@ public:
                     };
                 } sync_info;
 
-                INSERT_PADDING_WORDS(0x11E);
+                INSERT_UNION_PADDING_WORDS(0x11E);
 
                 u32 tfb_enabled;
 
-                INSERT_PADDING_WORDS(0x2E);
+                INSERT_UNION_PADDING_WORDS(0x2E);
 
                 std::array<RenderTargetConfig, NumRenderTargets> rt;
 
@@ -648,49 +648,49 @@ public:
 
                 std::array<ViewPort, NumViewports> viewports;
 
-                INSERT_PADDING_WORDS(0x1D);
+                INSERT_UNION_PADDING_WORDS(0x1D);
 
                 struct {
                     u32 first;
                     u32 count;
                 } vertex_buffer;
 
-                INSERT_PADDING_WORDS(1);
+                INSERT_UNION_PADDING_WORDS(1);
 
                 float clear_color[4];
                 float clear_depth;
 
-                INSERT_PADDING_WORDS(0x3);
+                INSERT_UNION_PADDING_WORDS(0x3);
 
                 s32 clear_stencil;
 
-                INSERT_PADDING_WORDS(0x7);
+                INSERT_UNION_PADDING_WORDS(0x7);
 
                 u32 polygon_offset_point_enable;
                 u32 polygon_offset_line_enable;
                 u32 polygon_offset_fill_enable;
 
-                INSERT_PADDING_WORDS(0xD);
+                INSERT_UNION_PADDING_WORDS(0xD);
 
                 std::array<ScissorTest, NumViewports> scissor_test;
 
-                INSERT_PADDING_WORDS(0x15);
+                INSERT_UNION_PADDING_WORDS(0x15);
 
                 s32 stencil_back_func_ref;
                 u32 stencil_back_mask;
                 u32 stencil_back_func_mask;
 
-                INSERT_PADDING_WORDS(0xC);
+                INSERT_UNION_PADDING_WORDS(0xC);
 
                 u32 color_mask_common;
 
-                INSERT_PADDING_WORDS(0x6);
+                INSERT_UNION_PADDING_WORDS(0x6);
 
                 u32 rt_separate_frag_data;
 
                 f32 depth_bounds[2];
 
-                INSERT_PADDING_WORDS(0xA);
+                INSERT_UNION_PADDING_WORDS(0xA);
 
                 struct {
                     u32 address_high;
@@ -710,7 +710,7 @@ public:
                     }
                 } zeta;
 
-                INSERT_PADDING_WORDS(0x41);
+                INSERT_UNION_PADDING_WORDS(0x41);
 
                 union {
                     BitField<0, 4, u32> stencil;
@@ -719,11 +719,11 @@ public:
                     BitField<12, 4, u32> viewport;
                 } clear_flags;
 
-                INSERT_PADDING_WORDS(0x19);
+                INSERT_UNION_PADDING_WORDS(0x19);
 
                 std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;
 
-                INSERT_PADDING_WORDS(0xF);
+                INSERT_UNION_PADDING_WORDS(0xF);
 
                 struct {
                     union {
@@ -746,16 +746,16 @@ public:
                     }
                 } rt_control;
 
-                INSERT_PADDING_WORDS(0x2);
+                INSERT_UNION_PADDING_WORDS(0x2);
 
                 u32 zeta_width;
                 u32 zeta_height;
 
-                INSERT_PADDING_WORDS(0x27);
+                INSERT_UNION_PADDING_WORDS(0x27);
 
                 u32 depth_test_enable;
 
-                INSERT_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x5);
 
                 u32 independent_blend_enable;
 
@@ -763,7 +763,7 @@ public:
 
                 u32 alpha_test_enabled;
 
-                INSERT_PADDING_WORDS(0x6);
+                INSERT_UNION_PADDING_WORDS(0x6);
 
                 u32 d3d_cull_mode;
 
@@ -777,7 +777,7 @@ public:
                     float b;
                     float a;
                 } blend_color;
-                INSERT_PADDING_WORDS(0x4);
+                INSERT_UNION_PADDING_WORDS(0x4);
 
                 struct {
                     u32 separate_alpha;
@@ -786,7 +786,7 @@ public:
                     Blend::Factor factor_dest_rgb;
                     Blend::Equation equation_a;
                     Blend::Factor factor_source_a;
-                    INSERT_PADDING_WORDS(1);
+                    INSERT_UNION_PADDING_WORDS(1);
                     Blend::Factor factor_dest_a;
 
                     u32 enable_common;
@@ -802,7 +802,7 @@ public:
                 u32 stencil_front_func_mask;
                 u32 stencil_front_mask;
 
-                INSERT_PADDING_WORDS(0x2);
+                INSERT_UNION_PADDING_WORDS(0x2);
 
                 u32 frag_color_clamp;
 
@@ -811,12 +811,12 @@ public:
                     BitField<4, 1, u32> triangle_rast_flip;
                 } screen_y_control;
 
-                INSERT_PADDING_WORDS(0x21);
+                INSERT_UNION_PADDING_WORDS(0x21);
 
                 u32 vb_element_base;
                 u32 vb_base_instance;
 
-                INSERT_PADDING_WORDS(0x35);
+                INSERT_UNION_PADDING_WORDS(0x35);
 
                 union {
                     BitField<0, 1, u32> c0;
@@ -829,11 +829,11 @@ public:
                     BitField<7, 1, u32> c7;
                 } clip_distance_enabled;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 float point_size;
 
-                INSERT_PADDING_WORDS(0x7);
+                INSERT_UNION_PADDING_WORDS(0x7);
 
                 u32 zeta_enable;
 
@@ -842,7 +842,7 @@ public:
                     BitField<4, 1, u32> alpha_to_one;
                 } multisample_control;
 
-                INSERT_PADDING_WORDS(0x4);
+                INSERT_UNION_PADDING_WORDS(0x4);
 
                 struct {
                     u32 address_high;
@@ -866,11 +866,11 @@ public:
                     }
                 } tsc;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 float polygon_offset_factor;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 struct {
                     u32 tic_address_high;
@@ -883,7 +883,7 @@ public:
                     }
                 } tic;
 
-                INSERT_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x5);
 
                 u32 stencil_two_side_enable;
                 StencilOp stencil_back_op_fail;
@@ -891,13 +891,13 @@ public:
                 StencilOp stencil_back_op_zpass;
                 ComparisonOp stencil_back_func_func;
 
-                INSERT_PADDING_WORDS(0x4);
+                INSERT_UNION_PADDING_WORDS(0x4);
 
                 u32 framebuffer_srgb;
 
                 float polygon_offset_units;
 
-                INSERT_PADDING_WORDS(0x11);
+                INSERT_UNION_PADDING_WORDS(0x11);
 
                 union {
                     BitField<2, 1, u32> coord_origin;
@@ -913,7 +913,7 @@ public:
                             (static_cast<GPUVAddr>(code_address_high) << 32) | code_address_low);
                     }
                 } code_address;
-                INSERT_PADDING_WORDS(1);
+                INSERT_UNION_PADDING_WORDS(1);
 
                 struct {
                     u32 vertex_end_gl;
@@ -925,14 +925,14 @@ public:
                     };
                 } draw;
 
-                INSERT_PADDING_WORDS(0xA);
+                INSERT_UNION_PADDING_WORDS(0xA);
 
                 struct {
                     u32 enabled;
                     u32 index;
                 } primitive_restart;
 
-                INSERT_PADDING_WORDS(0x5F);
+                INSERT_UNION_PADDING_WORDS(0x5F);
 
                 struct {
                     u32 start_addr_high;
@@ -973,9 +973,9 @@ public:
                     }
                 } index_array;
 
-                INSERT_PADDING_WORDS(0x7);
+                INSERT_UNION_PADDING_WORDS(0x7);
 
-                INSERT_PADDING_WORDS(0x1F);
+                INSERT_UNION_PADDING_WORDS(0x1F);
 
                 float polygon_offset_clamp;
 
@@ -989,17 +989,17 @@ public:
                     }
                 } instanced_arrays;
 
-                INSERT_PADDING_WORDS(0x6);
+                INSERT_UNION_PADDING_WORDS(0x6);
 
                 Cull cull;
 
                 u32 pixel_center_integer;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 u32 viewport_transform_enabled;
 
-                INSERT_PADDING_WORDS(0x3);
+                INSERT_UNION_PADDING_WORDS(0x3);
 
                 union {
                     BitField<0, 1, u32> depth_range_0_1;
@@ -1007,13 +1007,13 @@ public:
                     BitField<4, 1, u32> depth_clamp_far;
                 } view_volume_clip_control;
 
-                INSERT_PADDING_WORDS(0x21);
+                INSERT_UNION_PADDING_WORDS(0x21);
                 struct {
                     u32 enable;
                     LogicOperation operation;
                 } logic_op;
 
-                INSERT_PADDING_WORDS(0x1);
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 union {
                     u32 raw;
@@ -1026,9 +1026,9 @@ public:
                     BitField<6, 4, u32> RT;
                     BitField<10, 11, u32> layer;
                 } clear_buffers;
-                INSERT_PADDING_WORDS(0xB);
+                INSERT_UNION_PADDING_WORDS(0xB);
                 std::array<ColorMask, NumRenderTargets> color_mask;
-                INSERT_PADDING_WORDS(0x38);
+                INSERT_UNION_PADDING_WORDS(0x38);
 
                 struct {
                     u32 query_address_high;
@@ -1050,7 +1050,7 @@ public:
                     }
                 } query;
 
-                INSERT_PADDING_WORDS(0x3C);
+                INSERT_UNION_PADDING_WORDS(0x3C);
 
                 struct {
                     union {
@@ -1090,10 +1090,10 @@ public:
                         BitField<4, 4, ShaderProgram> program;
                     };
                     u32 offset;
-                    INSERT_PADDING_WORDS(14);
+                    INSERT_UNION_PADDING_WORDS(14);
                 } shader_config[MaxShaderProgram];
 
-                INSERT_PADDING_WORDS(0x60);
+                INSERT_UNION_PADDING_WORDS(0x60);
 
                 u32 firmware[0x20];
 
@@ -1110,7 +1110,7 @@ public:
                     }
                 } const_buffer;
 
-                INSERT_PADDING_WORDS(0x10);
+                INSERT_UNION_PADDING_WORDS(0x10);
 
                 struct {
                     union {
@@ -1118,14 +1118,14 @@ public:
                         BitField<0, 1, u32> valid;
                         BitField<4, 5, u32> index;
                     };
-                    INSERT_PADDING_WORDS(7);
+                    INSERT_UNION_PADDING_WORDS(7);
                 } cb_bind[MaxShaderStage];
 
-                INSERT_PADDING_WORDS(0x56);
+                INSERT_UNION_PADDING_WORDS(0x56);
 
                 u32 tex_cb_index;
 
-                INSERT_PADDING_WORDS(0x395);
+                INSERT_UNION_PADDING_WORDS(0x395);
 
                 struct {
                     /// Compressed address of a buffer that holds information about bound SSBOs.
@@ -1137,14 +1137,14 @@ public:
                     }
                 } ssbo_info;
 
-                INSERT_PADDING_WORDS(0x11);
+                INSERT_UNION_PADDING_WORDS(0x11);
 
                 struct {
                     u32 address[MaxShaderStage];
                     u32 size[MaxShaderStage];
                 } tex_info_buffers;
 
-                INSERT_PADDING_WORDS(0xCC);
+                INSERT_UNION_PADDING_WORDS(0xCC);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 93808a9bb..4f40d1d1f 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -94,7 +94,7 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0xC0);
+                INSERT_UNION_PADDING_WORDS(0xC0);
 
                 struct {
                     union {
@@ -112,7 +112,7 @@ public:
                     };
                 } exec;
 
-                INSERT_PADDING_WORDS(0x3F);
+                INSERT_UNION_PADDING_WORDS(0x3F);
 
                 struct {
                     u32 address_high;
@@ -139,7 +139,7 @@ public:
                 u32 x_count;
                 u32 y_count;
 
-                INSERT_PADDING_WORDS(0xB8);
+                INSERT_UNION_PADDING_WORDS(0xB8);
 
                 u32 const0;
                 u32 const1;
@@ -162,11 +162,11 @@ public:
 
                 Parameters dst_params;
 
-                INSERT_PADDING_WORDS(1);
+                INSERT_UNION_PADDING_WORDS(1);
 
                 Parameters src_params;
 
-                INSERT_PADDING_WORDS(0x13);
+                INSERT_UNION_PADDING_WORDS(0x13);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h
index e86a7f04a..bc80661d8 100644
--- a/src/video_core/engines/shader_header.h
+++ b/src/video_core/engines/shader_header.h
@@ -38,37 +38,37 @@ struct Header {
         BitField<26, 1, u32> does_load_or_store;
         BitField<27, 1, u32> does_fp64;
         BitField<28, 4, u32> stream_out_mask;
-    } common0;
+    } common0{};
 
     union {
         BitField<0, 24, u32> shader_local_memory_low_size;
         BitField<24, 8, u32> per_patch_attribute_count;
-    } common1;
+    } common1{};
 
     union {
         BitField<0, 24, u32> shader_local_memory_high_size;
         BitField<24, 8, u32> threads_per_input_primitive;
-    } common2;
+    } common2{};
 
     union {
         BitField<0, 24, u32> shader_local_memory_crs_size;
         BitField<24, 4, OutputTopology> output_topology;
         BitField<28, 4, u32> reserved;
-    } common3;
+    } common3{};
 
     union {
         BitField<0, 12, u32> max_output_vertices;
         BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
         BitField<24, 4, u32> reserved;
         BitField<12, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
-    } common4;
+    } common4{};
 
     union {
         struct {
-            INSERT_PADDING_BYTES(3);  // ImapSystemValuesA
-            INSERT_PADDING_BYTES(1);  // ImapSystemValuesB
-            INSERT_PADDING_BYTES(16); // ImapGenericVector[32]
-            INSERT_PADDING_BYTES(2);  // ImapColor
+            INSERT_UNION_PADDING_BYTES(3);  // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // ImapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // ImapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
             union {
                 BitField<0, 8, u16> clip_distances;
                 BitField<8, 1, u16> point_sprite_s;
@@ -79,20 +79,20 @@ struct Header {
                 BitField<14, 1, u16> instance_id;
                 BitField<15, 1, u16> vertex_id;
             };
-            INSERT_PADDING_BYTES(5);  // ImapFixedFncTexture[10]
-            INSERT_PADDING_BYTES(1);  // ImapReserved
-            INSERT_PADDING_BYTES(3);  // OmapSystemValuesA
-            INSERT_PADDING_BYTES(1);  // OmapSystemValuesB
-            INSERT_PADDING_BYTES(16); // OmapGenericVector[32]
-            INSERT_PADDING_BYTES(2);  // OmapColor
-            INSERT_PADDING_BYTES(2);  // OmapSystemValuesC
-            INSERT_PADDING_BYTES(5);  // OmapFixedFncTexture[10]
-            INSERT_PADDING_BYTES(1);  // OmapReserved
+            INSERT_UNION_PADDING_BYTES(5);  // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // ImapReserved
+            INSERT_UNION_PADDING_BYTES(3);  // OmapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // OmapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // OmapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // OmapColor
+            INSERT_UNION_PADDING_BYTES(2);  // OmapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(5);  // OmapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // OmapReserved
         } vtg;
 
         struct {
-            INSERT_PADDING_BYTES(3); // ImapSystemValuesA
-            INSERT_PADDING_BYTES(1); // ImapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB
             union {
                 BitField<0, 2, AttributeUse> x;
                 BitField<2, 2, AttributeUse> y;
@@ -100,10 +100,10 @@ struct Header {
                 BitField<6, 2, AttributeUse> z;
                 u8 raw;
             } imap_generic_vector[32];
-            INSERT_PADDING_BYTES(2);  // ImapColor
-            INSERT_PADDING_BYTES(2);  // ImapSystemValuesC
-            INSERT_PADDING_BYTES(10); // ImapFixedFncTexture[10]
-            INSERT_PADDING_BYTES(2);  // ImapReserved
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
+            INSERT_UNION_PADDING_BYTES(2);  // ImapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(10); // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapReserved
             struct {
                 u32 target;
                 union {
@@ -139,6 +139,8 @@ struct Header {
                 return result;
             }
         } ps;
+
+        std::array<u32, 0xF> raw{};
     };
 
     u64 GetLocalMemorySize() const {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index dbca19f35..ecc338ae9 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -207,7 +207,7 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0x4);
+                INSERT_UNION_PADDING_WORDS(0x4);
                 struct {
                     u32 address_high;
                     u32 address_low;
@@ -220,12 +220,12 @@ public:
 
                 u32 semaphore_sequence;
                 u32 semaphore_trigger;
-                INSERT_PADDING_WORDS(0xC);
+                INSERT_UNION_PADDING_WORDS(0xC);
 
                 // The puser and the puller share the reference counter, the pusher only has read
                 // access
                 u32 reference_count;
-                INSERT_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x5);
 
                 u32 semaphore_acquire;
                 u32 semaphore_release;
@@ -234,7 +234,7 @@ public:
                     BitField<4, 4, u32> operation;
                     BitField<8, 8, u32> id;
                 } fence_action;
-                INSERT_PADDING_WORDS(0xE2);
+                INSERT_UNION_PADDING_WORDS(0xE2);
 
                 // Puller state
                 u32 acquire_mode;
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index f8a807c84..0375fca17 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,13 +8,17 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
 CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
@@ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz
 CachedBufferBlock::~CachedBufferBlock() = default;
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               std::size_t stream_size)
-    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
-          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
+                               const Device& device, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+    if (!device.HasFastBufferSubData()) { // Without it, no dedicated cbuf objects are created
+        return;
+    }
+
+    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+    for (const GLuint cbuf : cbufs) {
+        glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); // Allocate storage; no data yet
+    }
+}
 
-OGLBufferCache::~OGLBufferCache() = default;
+OGLBufferCache::~OGLBufferCache() { // NOTE(review): cbufs must be zeroed if never created
+    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+}
 
 Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
     return std::make_shared<CachedBufferBlock>(cache_addr, size);
@@ -69,4 +84,16 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
                              static_cast<GLsizeiptr>(size));
 }
 
+/// Uploads size bytes from raw_pointer into the next unused dedicated constant buffer.
+/// Returns a pointer to that buffer's name inside cbufs together with a zero offset.
+OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
+                                                             std::size_t size) {
+    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
+    // Bind a reference to the array element so the returned pointer stays valid after return.
+    const GLuint& handle = cbufs[cbuf_cursor];
+    ++cbuf_cursor;
+    glNamedBufferSubData(handle, 0, static_cast<GLsizeiptr>(size), raw_pointer);
+    return {&handle, 0};
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 022e7bfa9..8c7145443 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -4,10 +4,12 @@
 
 #pragma once
 
+#include <array>
 #include <memory>
 
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -18,12 +20,14 @@ class System;
 
 namespace OpenGL {
 
+class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
 class CachedBufferBlock;
 
 using Buffer = std::shared_ptr<CachedBufferBlock>;
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 
 class CachedBufferBlock : public VideoCommon::BufferBlock {
 public:
@@ -38,14 +42,18 @@ private:
     OGLBuffer gl_buffer{};
 };
 
-class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
+class OGLBufferCache final : public GenericBufferCache {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                            std::size_t stream_size);
+                            const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
     const GLuint* GetEmptyBuffer(std::size_t) override;
 
+    void Acquire() noexcept {
+        cbuf_cursor = 0;
+    }
+
 protected:
     Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
 
@@ -61,6 +69,17 @@ protected:
 
     void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                    std::size_t dst_offset, std::size_t size) override;
+
+    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
+
+private:
+    /// Index into cbufs of the next buffer ConstBufferUpload hands out; Acquire resets it.
+    std::size_t cbuf_cursor = 0;
+    /// Dedicated constant buffer names, zero-initialized: the destructor always passes them to
+    /// glDeleteBuffers (which ignores 0) even if the constructor never created them.
+    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
+        cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 64de7e425..c65b24c69 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -51,8 +51,11 @@ bool HasExtension(const std::vector<std::string_view>& images, std::string_view
 } // Anonymous namespace
 
 Device::Device() {
+    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
     const std::vector extensions = GetExtensions();
 
+    const bool is_nvidia = vendor == "NVIDIA Corporation";
+
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -64,6 +67,7 @@ Device::Device() {
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = TestComponentIndexingBug();
     has_precise_bug = TestPreciseBug();
+    has_fast_buffer_sub_data = is_nvidia;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index bb273c3d6..bf35bd0b6 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -54,6 +54,10 @@ public:
         return has_precise_bug;
     }
 
+    bool HasFastBufferSubData() const {
+        return has_fast_buffer_sub_data;
+    }
+
 private:
     static bool TestVariableAoffi();
     static bool TestComponentIndexingBug();
@@ -69,6 +73,7 @@ private:
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
+    bool has_fast_buffer_sub_data{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 6a4d2c83a..e560d70d5 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                    ScreenInfo& info)
     : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
-      system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
+      system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.Apply();
@@ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() {
     SyncPolygonOffset();
     SyncAlphaTest();
 
+    buffer_cache.Acquire();
+
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
 
@@ -879,7 +881,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
     const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
+                                                          device.HasFastBufferSubData());
     bind_ubo_pushbuffer.Push(cbuf, offset, size);
 }
 
@@ -935,10 +938,9 @@ TextureBufferUsage RasterizerOpenGL::SetupDrawTextures(Maxwell::ShaderStage stag
             if (!entry.IsBindless()) {
                 return maxwell3d.GetStageTexture(stage, entry.GetOffset());
             }
-            const auto cbuf = entry.GetBindlessCBuf();
-            Tegra::Texture::TextureHandle tex_handle;
-            Tegra::Engines::ShaderType shader_type = static_cast<Tegra::Engines::ShaderType>(stage);
-            tex_handle.raw = maxwell3d.AccessConstBuffer32(shader_type, cbuf.first, cbuf.second);
+            const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage);
+            const Tegra::Texture::TextureHandle tex_handle =
+                maxwell3d.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset());
             return maxwell3d.GetTextureInfo(tex_handle);
         }();
 
@@ -966,10 +968,8 @@ TextureBufferUsage RasterizerOpenGL::SetupComputeTextures(const Shader& kernel)
             if (!entry.IsBindless()) {
                 return compute.GetTexture(entry.GetOffset());
             }
-            const auto cbuf = entry.GetBindlessCBuf();
-            Tegra::Texture::TextureHandle tex_handle;
-            tex_handle.raw = compute.AccessConstBuffer32(Tegra::Engines::ShaderType::Compute,
-                                                         cbuf.first, cbuf.second);
+            const Tegra::Texture::TextureHandle tex_handle = compute.AccessConstBuffer32(
+                Tegra::Engines::ShaderType::Compute, entry.GetBuffer(), entry.GetOffset());
             return compute.GetTextureInfo(tex_handle);
         }();
 
@@ -1012,10 +1012,8 @@ void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
             if (!entry.IsBindless()) {
                 return compute.GetTexture(entry.GetOffset()).tic;
             }
-            const auto cbuf = entry.GetBindlessCBuf();
-            Tegra::Texture::TextureHandle tex_handle;
-            tex_handle.raw = compute.AccessConstBuffer32(Tegra::Engines::ShaderType::Compute,
-                                                         cbuf.first, cbuf.second);
+            const Tegra::Texture::TextureHandle tex_handle = compute.AccessConstBuffer32(
+                Tegra::Engines::ShaderType::Compute, entry.GetBuffer(), entry.GetOffset());
             return compute.GetTextureInfo(tex_handle).tic;
         }();
         SetupImage(bindpoint, tic, entry);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 8a514cb8a..0ce59a852 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -735,7 +735,7 @@ private:
 
     void DeclareImages() {
         const auto& images{ir.GetImages()};
-        for (const auto& [offset, image] : images) {
+        for (const auto& image : images) {
             std::string qualifier = "coherent volatile";
             if (image.IsRead() && !image.IsWritten()) {
                 qualifier += " readonly";
@@ -2466,16 +2466,16 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
                                            cbuf.first);
     }
+    for (const auto& [base, usage] : ir.GetGlobalMemory()) {
+        entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_read,
+                                                   usage.is_written);
+    }
     for (const auto& sampler : ir.GetSamplers()) {
         entries.samplers.emplace_back(sampler);
     }
-    for (const auto& [offset, image] : ir.GetImages()) {
+    for (const auto& image : ir.GetImages()) {
         entries.images.emplace_back(image);
     }
-    for (const auto& [base, usage] : ir.GetGlobalMemory()) {
-        entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_read,
-                                                   usage.is_written);
-    }
     entries.clip_distances = ir.GetClipDistances();
     entries.shader_length = ir.GetLength();
     return entries;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index fead2a51e..b1e75e6cc 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -82,10 +82,9 @@ private:
 
 struct ShaderEntries {
     std::vector<ConstBufferEntry> const_buffers;
+    std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
-    std::vector<SamplerEntry> bindless_samplers;
     std::vector<ImageEntry> images;
-    std::vector<GlobalMemoryEntry> global_memory_entries;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
     std::size_t shader_length{};
 };
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index d47c63d9f..b427ac873 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -16,7 +16,9 @@
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
+
 namespace {
+
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 
@@ -68,15 +70,15 @@ struct CFGRebuildState {
     const ProgramCode& program_code;
     ConstBufferLocker& locker;
     u32 start{};
-    std::vector<BlockInfo> block_info{};
-    std::list<u32> inspect_queries{};
-    std::list<Query> queries{};
-    std::unordered_map<u32, u32> registered{};
-    std::set<u32> labels{};
-    std::map<u32, u32> ssy_labels{};
-    std::map<u32, u32> pbk_labels{};
-    std::unordered_map<u32, BlockStack> stacks{};
-    ASTManager* manager;
+    std::vector<BlockInfo> block_info;
+    std::list<u32> inspect_queries;
+    std::list<Query> queries;
+    std::unordered_map<u32, u32> registered;
+    std::set<u32> labels;
+    std::map<u32, u32> ssy_labels;
+    std::map<u32, u32> pbk_labels;
+    std::unordered_map<u32, BlockStack> stacks;
+    ASTManager* manager{};
 };
 
 enum class BlockCollision : u32 { None, Found, Inside };
@@ -109,7 +111,7 @@ BlockInfo& CreateBlockInfo(CFGRebuildState& state, u32 start, u32 end) {
 }
 
 Pred GetPredicate(u32 index, bool negated) {
-    return static_cast<Pred>(index + (negated ? 8 : 0));
+    return static_cast<Pred>(static_cast<u64>(index) + (negated ? 8ULL : 0ULL));
 }
 
 /**
@@ -136,15 +138,13 @@ struct BranchIndirectInfo {
     s32 relative_position{};
 };
 
-std::optional<BranchIndirectInfo> TrackBranchIndirectInfo(const CFGRebuildState& state,
-                                                          u32 start_address, u32 current_position) {
-    const u32 shader_start = state.start;
-    u32 pos = current_position;
-    BranchIndirectInfo result{};
-    u64 track_register = 0;
+struct BufferInfo {
+    u32 index;
+    u32 offset;
+};
 
-    // Step 0 Get BRX Info
-    const Instruction instr = {state.program_code[pos]};
+std::optional<std::pair<s32, u64>> GetBRXInfo(const CFGRebuildState& state, u32& pos) {
+    const Instruction instr = state.program_code[pos];
     const auto opcode = OpCode::Decode(instr);
     if (opcode->get().GetId() != OpCode::Id::BRX) {
         return std::nullopt;
@@ -152,86 +152,94 @@ std::optional<BranchIndirectInfo> TrackBranchIndirectInfo(const CFGRebuildState&
     if (instr.brx.constant_buffer != 0) {
         return std::nullopt;
     }
-    track_register = instr.gpr8.Value();
-    result.relative_position = instr.brx.GetBranchExtend();
-    pos--;
-    bool found_track = false;
+    --pos;
+    return std::make_pair(instr.brx.GetBranchExtend(), instr.gpr8.Value());
+}
 
-    // Step 1 Track LDC
-    while (pos >= shader_start) {
-        if (IsSchedInstruction(pos, shader_start)) {
-            pos--;
+/// Searches backwards from pos for an instruction accepted by test, ignoring words flagged by
+/// IsSchedInstruction or that fail to decode; returns pack's value and steps pos before the hit.
+template <typename Result, typename TestCallable, typename PackCallable>
+// requires std::predicate<TestCallable, Instruction, const OpCode::Matcher&>
+// requires std::invocable<PackCallable, Instruction, const OpCode::Matcher&>
+std::optional<Result> TrackInstruction(const CFGRebuildState& state, u32& pos, TestCallable test,
+                                       PackCallable pack) {
+    // "pos + 1 > start" is "pos >= start" that also stops once pos has wrapped below address 0.
+    for (; pos + 1 > state.start; --pos) {
+        if (IsSchedInstruction(pos, state.start)) {
             continue;
         }
-        const Instruction instr = {state.program_code[pos]};
+        const Instruction instr = state.program_code[pos];
         const auto opcode = OpCode::Decode(instr);
-        if (opcode->get().GetId() == OpCode::Id::LD_C) {
-            if (instr.gpr0.Value() == track_register &&
-                instr.ld_c.type.Value() == Tegra::Shader::UniformType::Single) {
-                result.buffer = instr.cbuf36.index.Value();
-                result.offset = static_cast<u32>(instr.cbuf36.GetOffset());
-                track_register = instr.gpr8.Value();
-                pos--;
-                found_track = true;
-                break;
-            }
+        if (opcode && test(instr, opcode->get())) {
+            --pos;
+            return std::make_optional(pack(instr, opcode->get()));
         }
-        pos--;
     }
+    return std::nullopt;
+}
 
-    if (!found_track) {
-        return std::nullopt;
-    }
-    found_track = false;
+std::optional<std::pair<BufferInfo, u64>> TrackLDC(const CFGRebuildState& state, u32& pos,
+                                                   u64 brx_tracked_register) {
+    return TrackInstruction<std::pair<BufferInfo, u64>>(
+        state, pos,
+        [brx_tracked_register](auto instr, const auto& opcode) {
+            return opcode.GetId() == OpCode::Id::LD_C &&
+                   instr.gpr0.Value() == brx_tracked_register &&
+                   instr.ld_c.type.Value() == Tegra::Shader::UniformType::Single;
+        },
+        [](auto instr, const auto& opcode) {
+            const BufferInfo info = {static_cast<u32>(instr.cbuf36.index.Value()),
+                                     static_cast<u32>(instr.cbuf36.GetOffset())};
+            return std::make_pair(info, instr.gpr8.Value());
+        });
+}
 
-    // Step 2 Track SHL
-    while (pos >= shader_start) {
-        if (IsSchedInstruction(pos, shader_start)) {
-            pos--;
-            continue;
-        }
-        const Instruction instr = state.program_code[pos];
-        const auto opcode = OpCode::Decode(instr);
-        if (opcode->get().GetId() == OpCode::Id::SHL_IMM) {
-            if (instr.gpr0.Value() == track_register) {
-                track_register = instr.gpr8.Value();
-                pos--;
-                found_track = true;
-                break;
-            }
-        }
-        pos--;
+std::optional<u64> TrackSHLRegister(const CFGRebuildState& state, u32& pos,
+                                    u64 ldc_tracked_register) {
+    return TrackInstruction<u64>(state, pos,
+                                 [ldc_tracked_register](auto instr, const auto& opcode) {
+                                     return opcode.GetId() == OpCode::Id::SHL_IMM &&
+                                            instr.gpr0.Value() == ldc_tracked_register;
+                                 },
+                                 [](auto instr, const auto&) { return instr.gpr8.Value(); });
+}
+
+std::optional<u32> TrackIMNMXValue(const CFGRebuildState& state, u32& pos,
+                                   u64 shl_tracked_register) {
+    return TrackInstruction<u32>(state, pos,
+                                 [shl_tracked_register](auto instr, const auto& opcode) {
+                                     return opcode.GetId() == OpCode::Id::IMNMX_IMM &&
+                                            instr.gpr0.Value() == shl_tracked_register;
+                                 },
+                                 [](auto instr, const auto&) {
+                                     return static_cast<u32>(instr.alu.GetSignedImm20_20() + 1);
+                                 });
+}
+
+std::optional<BranchIndirectInfo> TrackBranchIndirectInfo(const CFGRebuildState& state, u32 pos) {
+    const auto brx_info = GetBRXInfo(state, pos);
+    if (!brx_info) {
+        return std::nullopt;
     }
+    const auto [relative_position, brx_tracked_register] = *brx_info;
 
-    if (!found_track) {
+    const auto ldc_info = TrackLDC(state, pos, brx_tracked_register);
+    if (!ldc_info) {
         return std::nullopt;
     }
-    found_track = false;
+    const auto [buffer_info, ldc_tracked_register] = *ldc_info;
 
-    // Step 3 Track IMNMX
-    while (pos >= shader_start) {
-        if (IsSchedInstruction(pos, shader_start)) {
-            pos--;
-            continue;
-        }
-        const Instruction instr = state.program_code[pos];
-        const auto opcode = OpCode::Decode(instr);
-        if (opcode->get().GetId() == OpCode::Id::IMNMX_IMM) {
-            if (instr.gpr0.Value() == track_register) {
-                track_register = instr.gpr8.Value();
-                result.entries = instr.alu.GetSignedImm20_20() + 1;
-                pos--;
-                found_track = true;
-                break;
-            }
-        }
-        pos--;
+    const auto shl_tracked_register = TrackSHLRegister(state, pos, ldc_tracked_register);
+    if (!shl_tracked_register) {
+        return std::nullopt;
     }
 
-    if (!found_track) {
+    const auto entries = TrackIMNMXValue(state, pos, *shl_tracked_register);
+    if (!entries) {
         return std::nullopt;
     }
-    return result;
+
+    return BranchIndirectInfo{buffer_info.index, buffer_info.offset, *entries, relative_position};
 }
 
 std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) {
@@ -420,30 +428,30 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address)
             break;
         }
         case OpCode::Id::BRX: {
-            auto tmp = TrackBranchIndirectInfo(state, address, offset);
-            if (tmp) {
-                auto result = *tmp;
-                std::vector<CaseBranch> branches{};
-                s32 pc_target = offset + result.relative_position;
-                for (u32 i = 0; i < result.entries; i++) {
-                    auto k = state.locker.ObtainKey(result.buffer, result.offset + i * 4);
-                    if (!k) {
-                        return {ParseResult::AbnormalFlow, parse_info};
-                    }
-                    u32 value = *k;
-                    u32 target = static_cast<u32>((value >> 3) + pc_target);
-                    insert_label(state, target);
-                    branches.emplace_back(value, target);
-                }
-                parse_info.end_address = offset;
-                parse_info.branch_info = MakeBranchInfo<MultiBranch>(
-                    static_cast<u32>(instr.gpr8.Value()), std::move(branches));
-
-                return {ParseResult::ControlCaught, parse_info};
-            } else {
+            const auto tmp = TrackBranchIndirectInfo(state, offset);
+            if (!tmp) {
                 LOG_WARNING(HW_GPU, "BRX Track Unsuccesful");
+                return {ParseResult::AbnormalFlow, parse_info};
             }
-            return {ParseResult::AbnormalFlow, parse_info};
+
+            const auto result = *tmp;
+            const s32 pc_target = offset + result.relative_position;
+            std::vector<CaseBranch> branches;
+            for (u32 i = 0; i < result.entries; i++) {
+                auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4);
+                if (!key) {
+                    return {ParseResult::AbnormalFlow, parse_info};
+                }
+                u32 value = *key;
+                u32 target = static_cast<u32>((value >> 3) + pc_target);
+                insert_label(state, target);
+                branches.emplace_back(value, target);
+            }
+            parse_info.end_address = offset;
+            parse_info.branch_info = MakeBranchInfo<MultiBranch>(
+                static_cast<u32>(instr.gpr8.Value()), std::move(branches));
+
+            return {ParseResult::ControlCaught, parse_info};
         }
         default:
             break;
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 1473c282a..fcedd2af6 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -43,12 +43,12 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
     case OpCode::Id::FMUL_IMM: {
         // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
         if (instr.fmul.tab5cb8_2 != 0) {
-            LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
-                        instr.fmul.tab5cb8_2.Value());
+            LOG_DEBUG(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
+                      instr.fmul.tab5cb8_2.Value());
         }
         if (instr.fmul.tab5c68_0 != 1) {
-            LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
-                        instr.fmul.tab5c68_0.Value());
+            LOG_DEBUG(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
+                      instr.fmul.tab5c68_0.Value());
         }
 
         op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
@@ -144,10 +144,11 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
     case OpCode::Id::RRO_C:
     case OpCode::Id::RRO_R:
     case OpCode::Id::RRO_IMM: {
+        LOG_DEBUG(HW_GPU, "(STUBBED) RRO used");
+
         // Currently RRO is only implemented as a register move.
         op_b = GetOperandAbsNegFloat(op_b, instr.alu.abs_b, instr.alu.negate_b);
         SetRegister(bb, instr.gpr0, op_b);
-        LOG_WARNING(HW_GPU, "RRO instruction is incomplete");
         break;
     }
     default:
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index b06cbe441..ee7d9a29d 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -21,8 +21,8 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
         opcode->get().GetId() == OpCode::Id::HADD2_R) {
-        if (instr.alu_half.ftz != 0) {
-            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        if (instr.alu_half.ftz == 0) {
+            LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
         }
     }
 
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index 6466fc011..d179b9873 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -19,12 +19,12 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) {
-        if (instr.alu_half_imm.ftz != 0) {
-            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        if (instr.alu_half_imm.ftz == 0) {
+            LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
         }
     } else {
-        if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
-            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::FTZ) {
+            LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
         }
     }
 
diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp
index ca2f39e8d..5973588d6 100644
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -19,10 +19,10 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
 
     UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
     if (instr.ffma.tab5980_0 != 1) {
-        LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
+        LOG_DEBUG(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
     }
     if (instr.ffma.tab5980_1 != 0) {
-        LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
+        LOG_DEBUG(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
     }
 
     const Node op_a = GetRegister(instr.gpr8);
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 48ca7a4af..848e46874 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -20,8 +20,8 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (instr.hset2.ftz != 0) {
-        LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+    if (instr.hset2.ftz == 0) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
     }
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index fec8f2dbe..310655619 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -19,7 +19,9 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    LOG_DEBUG(HW_GPU, "ftz={}", static_cast<u32>(instr.hsetp2.ftz));
+    if (instr.hsetp2.ftz == 0) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    }
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index b02d2cb95..d2fe4ec5d 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -143,39 +143,43 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
 }
 
+/// Returns the entry of used_images registered for the offset encoded in image.index,
+/// appending a new non-bindless entry when that offset has not been seen yet.
 Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) {
-    const auto offset{static_cast<std::size_t>(image.index.Value())};
-    if (const auto existing_image = TryUseExistingImage(offset, type)) {
-        return *existing_image;
+    const auto offset = static_cast<u32>(image.index.Value());
+
+    const auto it =
+        std::find_if(std::begin(used_images), std::end(used_images),
+                     [offset](const Image& entry) { return entry.GetOffset() == offset; });
+    if (it != std::end(used_images)) {
+        // An entry may only be reused as a non-bindless image of the requested type.
+        ASSERT(!it->IsBindless() && it->GetType() == type);
+        return *it;
     }
 
-    const std::size_t next_index{used_images.size()};
-    return used_images.emplace(offset, Image{offset, next_index, type}).first->second;
+    const auto next_index = static_cast<u32>(used_images.size());
+    return used_images.emplace_back(next_index, offset, type);
 }
 
+/// Returns the entry of used_images whose constant buffer index and offset match the ones
+/// TrackCbuf reports for reg, appending a new bindless entry when there is no match yet.
 Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) {
-    const Node image_register{GetRegister(reg)};
-    const auto [base_image, cbuf_index, cbuf_offset]{
-        TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
-    const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};
-
-    if (const auto image = TryUseExistingImage(cbuf_key, type)) {
-        return *image;
-    }
-
-    const std::size_t next_index{used_images.size()};
-    return used_images.emplace(cbuf_key, Image{cbuf_index, cbuf_offset, next_index, type})
-        .first->second;
-}
-
-Image* ShaderIR::TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type) {
-    auto it = used_images.find(offset);
-    if (it == used_images.end()) {
-        return nullptr;
+    const Node image_register = GetRegister(reg);
+    const auto [base_image, buffer, offset] =
+        TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()));
+
+    const auto it =
+        std::find_if(std::begin(used_images), std::end(used_images),
+                     [buffer = buffer, offset = offset](const Image& entry) {
+                         return entry.GetBuffer() == buffer && entry.GetOffset() == offset;
+                     });
+    if (it != std::end(used_images)) {
+        // An entry may only be reused as a bindless image of the requested type.
+        ASSERT(it->IsBindless() && it->GetType() == type);
+        return *it;
     }
-    auto& image = it->second;
-    ASSERT(image.GetType() == type);
 
-    return &image;
+    const auto next_index = static_cast<u32>(used_images.size());
+    return used_images.emplace_back(next_index, offset, buffer, type);
 }
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 0599ef34f..bb926a132 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -44,10 +44,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
     bool is_bindless = false;
     switch (opcode->get().GetId()) {
     case OpCode::Id::TEX: {
-        if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
-        }
-
         const TextureType texture_type{instr.tex.texture_type};
         const bool is_array = instr.tex.array != 0;
         const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
@@ -62,10 +58,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI),
                              "AOFFI is not implemented");
 
-        if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
-        }
-
         const TextureType texture_type{instr.tex_b.texture_type};
         const bool is_array = instr.tex_b.array != 0;
         const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
@@ -82,10 +74,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         const bool depth_compare = instr.texs.UsesMiscMode(TextureMiscMode::DC);
         const auto process_mode = instr.texs.GetTextureProcessMode();
 
-        if (instr.texs.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TEXS.NODEP implementation is incomplete");
-        }
-
         const Node4 components =
             GetTexsCode(instr, texture_type, process_mode, depth_compare, is_array);
 
@@ -107,10 +95,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP),
                              "PTP is not implemented");
 
-        if (instr.tld4.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TLD4.NODEP implementation is incomplete");
-        }
-
         const auto texture_type = instr.tld4.texture_type.Value();
         const bool depth_compare = is_bindless ? instr.tld4_b.UsesMiscMode(TextureMiscMode::DC)
                                                : instr.tld4.UsesMiscMode(TextureMiscMode::DC);
@@ -119,15 +103,12 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
                                           : instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI);
         WriteTexInstructionFloat(
             bb, instr,
-            GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi, is_bindless), true);
+            GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi, is_bindless));
         break;
     }
     case OpCode::Id::TLD4S: {
         UNIMPLEMENTED_IF_MSG(instr.tld4s.UsesMiscMode(TextureMiscMode::AOFFI),
                              "AOFFI is not implemented");
-        if (instr.tld4s.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TLD4S.NODEP implementation is incomplete");
-        }
 
         const bool depth_compare = instr.tld4s.UsesMiscMode(TextureMiscMode::DC);
         const Node op_a = GetRegister(instr.gpr8);
@@ -164,10 +145,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         is_bindless = true;
         [[fallthrough]];
     case OpCode::Id::TXQ: {
-        if (instr.txq.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TXQ.NODEP implementation is incomplete");
-        }
-
         // TODO: The new commits on the texture refactor, change the way samplers work.
         // Sadly, not all texture instructions specify the type of texture their sampler
         // uses. This must be fixed at a later instance.
@@ -205,10 +182,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV),
                              "NDV is not implemented");
 
-        if (instr.tmml.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TMML.NODEP implementation is incomplete");
-        }
-
         auto texture_type = instr.tmml.texture_type.Value();
         const bool is_array = instr.tmml.array != 0;
         const auto& sampler =
@@ -254,25 +227,17 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.tld.ms, "MS is not implemented");
         UNIMPLEMENTED_IF_MSG(instr.tld.cl, "CL is not implemented");
 
-        if (instr.tld.nodep_flag) {
-            LOG_WARNING(HW_GPU, "TLD.NODEP implementation is incomplete");
-        }
-
         WriteTexInstructionFloat(bb, instr, GetTldCode(instr));
         break;
     }
     case OpCode::Id::TLDS: {
-        const Tegra::Shader::TextureType texture_type{instr.tlds.GetTextureType()};
+        const TextureType texture_type{instr.tlds.GetTextureType()};
         const bool is_array{instr.tlds.IsArrayTexture()};
 
         UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(TextureMiscMode::AOFFI),
                              "AOFFI is not implemented");
         UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(TextureMiscMode::MZ), "MZ is not implemented");
 
-        if (instr.tlds.UsesMiscMode(TextureMiscMode::NODEP)) {
-            LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete");
-        }
-
         const Node4 components = GetTldsCode(instr, texture_type, is_array);
 
         if (instr.tlds.fp32_flag) {
@@ -293,84 +258,86 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler,
                                     std::optional<SamplerInfo> sampler_info) {
     const auto offset = static_cast<u32>(sampler.index.Value());
 
-    Tegra::Shader::TextureType type;
+    TextureType type;
     bool is_array;
     bool is_shadow;
     if (sampler_info) {
         type = sampler_info->type;
         is_array = sampler_info->is_array;
         is_shadow = sampler_info->is_shadow;
-    } else if (auto sampler = locker.ObtainBoundSampler(offset); sampler) {
+    } else if (const auto sampler = locker.ObtainBoundSampler(offset)) {
         type = sampler->texture_type.Value();
         is_array = sampler->is_array.Value() != 0;
         is_shadow = sampler->is_shadow.Value() != 0;
     } else {
-        type = Tegra::Shader::TextureType::Texture2D;
+        LOG_WARNING(HW_GPU, "Unknown sampler info");
+        type = TextureType::Texture2D;
         is_array = false;
         is_shadow = false;
     }
 
     // If this sampler has already been used, return the existing mapping.
-    const auto itr =
+    const auto it =
         std::find_if(used_samplers.begin(), used_samplers.end(),
-                     [&](const Sampler& entry) { return entry.GetOffset() == offset; });
-    if (itr != used_samplers.end()) {
-        ASSERT(itr->GetType() == type && itr->IsArray() == is_array &&
-               itr->IsShadow() == is_shadow);
-        return *itr;
+                     [offset](const Sampler& entry) { return entry.GetOffset() == offset; });
+    if (it != used_samplers.end()) {
+        ASSERT(!it->IsBindless() && it->GetType() == type && it->IsArray() == is_array &&
+               it->IsShadow() == is_shadow);
+        return *it;
     }
 
     // Otherwise create a new mapping for this sampler
-    const std::size_t next_index = used_samplers.size();
-    const Sampler entry{offset, next_index, type, is_array, is_shadow};
-    return *used_samplers.emplace(entry).first;
-} // namespace VideoCommon::Shader
+    const auto next_index = static_cast<u32>(used_samplers.size());
+    return used_samplers.emplace_back(Sampler(next_index, offset, type, is_array, is_shadow));
+}
 
 const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg,
                                             std::optional<SamplerInfo> sampler_info) {
     const Node sampler_register = GetRegister(reg);
-    const auto [base_sampler, cbuf_index, cbuf_offset] =
+    const auto [base_sampler, buffer, offset] =
         TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
     ASSERT(base_sampler != nullptr);
-    const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);
-    Tegra::Shader::TextureType type;
+
+    TextureType type;
     bool is_array;
     bool is_shadow;
     if (sampler_info) {
         type = sampler_info->type;
         is_array = sampler_info->is_array;
         is_shadow = sampler_info->is_shadow;
-    } else if (auto sampler = locker.ObtainBindlessSampler(cbuf_index, cbuf_offset); sampler) {
+    } else if (const auto sampler = locker.ObtainBindlessSampler(buffer, offset)) {
         type = sampler->texture_type.Value();
         is_array = sampler->is_array.Value() != 0;
         is_shadow = sampler->is_shadow.Value() != 0;
     } else {
-        type = Tegra::Shader::TextureType::Texture2D;
+        LOG_WARNING(HW_GPU, "Unknown sampler info");
+        type = TextureType::Texture2D;
         is_array = false;
         is_shadow = false;
     }
 
     // If this sampler has already been used, return the existing mapping.
-    const auto itr =
+    const auto it =
         std::find_if(used_samplers.begin(), used_samplers.end(),
-                     [&](const Sampler& entry) { return entry.GetOffset() == cbuf_key; });
-    if (itr != used_samplers.end()) {
-        ASSERT(itr->GetType() == type && itr->IsArray() == is_array &&
-               itr->IsShadow() == is_shadow);
-        return *itr;
+                     [buffer = buffer, offset = offset](const Sampler& entry) {
+                         return entry.GetBuffer() == buffer && entry.GetOffset() == offset;
+                     });
+    if (it != used_samplers.end()) {
+        ASSERT(it->IsBindless() && it->GetType() == type && it->IsArray() == is_array &&
+               it->IsShadow() == is_shadow);
+        return *it;
     }
 
     // Otherwise create a new mapping for this sampler
-    const std::size_t next_index = used_samplers.size();
-    const Sampler entry{cbuf_index, cbuf_offset, next_index, type, is_array, is_shadow};
-    return *used_samplers.emplace(entry).first;
+    const auto next_index = static_cast<u32>(used_samplers.size());
+    return used_samplers.emplace_back(
+        Sampler(next_index, offset, buffer, type, is_array, is_shadow));
 }
 
-void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components,
-                                        bool is_tld4) {
+void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) {
     u32 dest_elem = 0;
     for (u32 elem = 0; elem < 4; ++elem) {
-        if (!is_tld4 && !instr.tex.IsComponentEnabled(elem)) {
+        if (!instr.tex.IsComponentEnabled(elem)) {
             // Skip disabled components
             continue;
         }
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 447fb5c1d..4300d9ff4 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -230,62 +230,49 @@ using NodeBlock = std::vector<Node>;
 class Sampler {
 public:
     /// This constructor is for bound samplers
-    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
-                     bool is_array, bool is_shadow)
-        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
-          is_bindless{false} {}
+    constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type,
+                               bool is_array, bool is_shadow)
+        : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow} {}
 
     /// This constructor is for bindless samplers
-    explicit Sampler(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
-                     Tegra::Shader::TextureType type, bool is_array, bool is_shadow)
-        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
-          is_array{is_array}, is_shadow{is_shadow}, is_bindless{true} {}
-
-    /// This constructor is for serialization/deserialization
-    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
-                     bool is_array, bool is_shadow, bool is_bindless)
-        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
-          is_bindless{is_bindless} {}
-
-    std::size_t GetOffset() const {
+    constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
+                               bool is_array, bool is_shadow)
+        : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
+          is_shadow{is_shadow}, is_bindless{true} {}
+
+    constexpr u32 GetIndex() const {
+        return index;
+    }
+
+    constexpr u32 GetOffset() const {
         return offset;
     }
 
-    std::size_t GetIndex() const {
-        return index;
+    constexpr u32 GetBuffer() const {
+        return buffer;
     }
 
-    Tegra::Shader::TextureType GetType() const {
+    constexpr Tegra::Shader::TextureType GetType() const {
         return type;
     }
 
-    bool IsArray() const {
+    constexpr bool IsArray() const {
         return is_array;
     }
 
-    bool IsShadow() const {
+    constexpr bool IsShadow() const {
         return is_shadow;
     }
 
-    bool IsBindless() const {
+    constexpr bool IsBindless() const {
         return is_bindless;
     }
 
-    std::pair<u32, u32> GetBindlessCBuf() const {
-        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
-    }
-
-    bool operator<(const Sampler& rhs) const {
-        return std::tie(index, offset, type, is_array, is_shadow, is_bindless) <
-               std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow,
-                        rhs.is_bindless);
-    }
-
 private:
-    /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
-    /// instruction.
-    std::size_t offset{};
-    std::size_t index{}; ///< Value used to index into the generated GLSL sampler array.
+    u32 index{};  ///< Emulated index given for this sampler.
+    u32 offset{}; ///< Offset in the const buffer from where the sampler is being read.
+    u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
+
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
     bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
     bool is_shadow{};   ///< Whether the texture is being sampled as a depth texture or not.
@@ -294,18 +281,13 @@ private:
 
 class Image final {
 public:
-    constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type)
-        : offset{offset}, index{index}, type{type}, is_bindless{false} {}
-
-    constexpr explicit Image(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
-                             Tegra::Shader::ImageType type)
-        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
-          is_bindless{true} {}
+    /// This constructor is for bound images
+    constexpr explicit Image(u32 index, u32 offset, Tegra::Shader::ImageType type)
+        : index{index}, offset{offset}, type{type} {}
 
-    constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
-                             bool is_bindless, bool is_written, bool is_read, bool is_atomic)
-        : offset{offset}, index{index}, type{type}, is_bindless{is_bindless},
-          is_written{is_written}, is_read{is_read}, is_atomic{is_atomic} {}
+    /// This constructor is for bindless images
+    constexpr explicit Image(u32 index, u32 offset, u32 buffer, Tegra::Shader::ImageType type)
+        : index{index}, offset{offset}, buffer{buffer}, type{type}, is_bindless{true} {}
 
     void MarkWrite() {
         is_written = true;
@@ -321,12 +303,16 @@ public:
         is_atomic = true;
     }
 
-    constexpr std::size_t GetOffset() const {
+    constexpr u32 GetIndex() const {
+        return index;
+    }
+
+    constexpr u32 GetOffset() const {
         return offset;
     }
 
-    constexpr std::size_t GetIndex() const {
-        return index;
+    constexpr u32 GetBuffer() const {
+        return buffer;
     }
 
     constexpr Tegra::Shader::ImageType GetType() const {
@@ -349,18 +335,11 @@ public:
         return is_atomic;
     }
 
-    constexpr std::pair<u32, u32> GetBindlessCBuf() const {
-        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
-    }
-
-    constexpr bool operator<(const Image& rhs) const {
-        return std::tie(offset, index, type, is_bindless) <
-               std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_bindless);
-    }
-
 private:
-    u64 offset{};
-    std::size_t index{};
+    u32 index{};
+    u32 offset{};
+    u32 buffer{};
+
     Tegra::Shader::ImageType type{};
     bool is_bindless{};
     bool is_written{};
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 7582999a5..26c8fde22 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <array>
+#include <list>
 #include <map>
 #include <optional>
 #include <set>
@@ -95,11 +96,11 @@ public:
         return used_cbufs;
     }
 
-    const std::set<Sampler>& GetSamplers() const {
+    const std::list<Sampler>& GetSamplers() const {
         return used_samplers;
     }
 
-    const std::map<u64, Image>& GetImages() const {
+    const std::list<Image>& GetImages() const {
         return used_images;
     }
 
@@ -316,9 +317,6 @@ private:
     /// Access a bindless image sampler.
     Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type);
 
-    /// Tries to access an existing image, updating it's state as needed
-    Image* TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type);
-
     /// Extracts a sequence of bits from a node
     Node BitfieldExtract(Node value, u32 offset, u32 bits);
 
@@ -326,7 +324,7 @@ private:
     Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
 
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
-                                  const Node4& components, bool is_tld4 = false);
+                                  const Node4& components);
 
     void WriteTexsInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                    const Node4& components, bool ignore_mask = false);
@@ -402,8 +400,8 @@ private:
     std::set<Tegra::Shader::Attribute::Index> used_input_attributes;
     std::set<Tegra::Shader::Attribute::Index> used_output_attributes;
     std::map<u32, ConstBuffer> used_cbufs;
-    std::set<Sampler> used_samplers;
-    std::map<u64, Image> used_images;
+    std::list<Sampler> used_samplers;
+    std::list<Image> used_images;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
     std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
     bool uses_layer{};
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 0429af9c1..27c8ce975 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -132,6 +132,8 @@ enum class SwizzleSource : u32 {
 };
 
 union TextureHandle {
+    TextureHandle(u32 raw) : raw{raw} {}
+
     u32 raw;
     BitField<0, 20, u32> tic_id;
     BitField<20, 12, u32> tsc_id;