19 files changed, 713 insertions, 405 deletions
diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp
deleted file mode 100644
index 0638be8cb..000000000
--- a/src/video_core/shader/const_buffer_locker.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <tuple>
-
-#include "common/common_types.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/shader/const_buffer_locker.h"
-
-namespace VideoCommon::Shader {
-
-using Tegra::Engines::SamplerDescriptor;
-
-ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage)
-    : stage{shader_stage} {}
-
-ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
-                                     Tegra::Engines::ConstBufferEngineInterface& engine)
-    : stage{shader_stage}, engine{&engine} {}
-
-ConstBufferLocker::~ConstBufferLocker() = default;
-
-std::optional<u32> ConstBufferLocker::ObtainKey(u32 buffer, u32 offset) {
-    const std::pair<u32, u32> key = {buffer, offset};
-    const auto iter = keys.find(key);
-    if (iter != keys.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
-    keys.emplace(key, value);
-    return value;
-}
-
-std::optional<SamplerDescriptor> ConstBufferLocker::ObtainBoundSampler(u32 offset) {
-    const u32 key = offset;
-    const auto iter = bound_samplers.find(key);
-    if (iter != bound_samplers.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
-    bound_samplers.emplace(key, value);
-    return value;
-}
-
-std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindlessSampler(
-    u32 buffer, u32 offset) {
-    const std::pair key = {buffer, offset};
-    const auto iter = bindless_samplers.find(key);
-    if (iter != bindless_samplers.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
-    bindless_samplers.emplace(key, value);
-    return value;
-}
-
-std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() {
-    if (bound_buffer_saved) {
-        return bound_buffer;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    bound_buffer_saved = true;
-    bound_buffer = engine->GetBoundBuffer();
-    return bound_buffer;
-}
-
-void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) {
-    keys.insert_or_assign({buffer, offset}, value);
-}
-
-void ConstBufferLocker::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
-    bound_samplers.insert_or_assign(offset, sampler);
-}
-
-void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
-    bindless_samplers.insert_or_assign({buffer, offset}, sampler);
-}
-
-void ConstBufferLocker::SetBoundBuffer(u32 buffer) {
-    bound_buffer_saved = true;
-    bound_buffer = buffer;
-}
-
-bool ConstBufferLocker::IsConsistent() const {
-    if (!engine) {
-        return false;
-    }
-    return std::all_of(keys.begin(), keys.end(),
-                       [this](const auto& pair) {
-                           const auto [cbuf, offset] = pair.first;
-                           const auto value = pair.second;
-                           return value == engine->AccessConstBuffer32(stage, cbuf, offset);
-                       }) &&
-           std::all_of(bound_samplers.begin(), bound_samplers.end(),
-                       [this](const auto& sampler) {
-                           const auto [key, value] = sampler;
-                           return value == engine->AccessBoundSampler(stage, key);
-                       }) &&
-           std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
-                       [this](const auto& sampler) {
-                           const auto [cbuf, offset] = sampler.first;
-                           const auto value = sampler.second;
-                           return value == engine->AccessBindlessSampler(stage, cbuf, offset);
-                       });
-}
-
-bool ConstBufferLocker::HasEqualKeys(const ConstBufferLocker& rhs) const {
-    return std::tie(keys, bound_samplers, bindless_samplers) ==
-           std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
-}
-
-} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h
deleted file mode 100644
index d3ea11087..000000000
--- a/src/video_core/shader/const_buffer_locker.h
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <optional>
-#include <unordered_map>
-#include "common/common_types.h"
-#include "common/hash.h"
-#include "video_core/engines/const_buffer_engine_interface.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/guest_driver.h"
-
-namespace VideoCommon::Shader {
-
-using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
-using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
-using BindlessSamplerMap =
-    std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
-
-/**
- * The ConstBufferLocker is a class use to interface the 3D and compute engines with the shader
- * compiler. with it, the shader can obtain required data from GPU state and store it for disk
- * shader compilation.
- */
-class ConstBufferLocker {
-public:
-    explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage);
-
-    explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
-                               Tegra::Engines::ConstBufferEngineInterface& engine);
-
-    ~ConstBufferLocker();
-
-    /// Retrieves a key from the locker, if it's registered, it will give the registered value, if
-    /// not it will obtain it from maxwell3d and register it.
-    std::optional<u32> ObtainKey(u32 buffer, u32 offset);
-
-    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
-
-    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
-
-    std::optional<u32> ObtainBoundBuffer();
-
-    /// Inserts a key.
-    void InsertKey(u32 buffer, u32 offset, u32 value);
-
-    /// Inserts a bound sampler key.
-    void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
-
-    /// Inserts a bindless sampler key.
-    void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
-
-    /// Set the bound buffer for this locker.
-    void SetBoundBuffer(u32 buffer);
-
-    /// Checks keys and samplers against engine's current const buffers. Returns true if they are
-    /// the same value, false otherwise;
-    bool IsConsistent() const;
-
-    /// Returns true if the keys are equal to the other ones in the locker.
-    bool HasEqualKeys(const ConstBufferLocker& rhs) const;
-
-    /// Gives an getter to the const buffer keys in the database.
-    const KeyMap& GetKeys() const {
-        return keys;
-    }
-
-    /// Gets samplers database.
-    const BoundSamplerMap& GetBoundSamplers() const {
-        return bound_samplers;
-    }
-
-    /// Gets bindless samplers database.
-    const BindlessSamplerMap& GetBindlessSamplers() const {
-        return bindless_samplers;
-    }
-
-    /// Gets bound buffer used on this shader
-    u32 GetBoundBuffer() const {
-        return bound_buffer;
-    }
-
-    /// Obtains access to the guest driver's profile.
-    VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const {
-        if (engine) {
-            return &engine->AccessGuestDriverProfile();
-        }
-        return nullptr;
-    }
-
-private:
-    const Tegra::Engines::ShaderType stage;
-    Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
-    KeyMap keys;
-    BoundSamplerMap bound_samplers;
-    BindlessSamplerMap bindless_samplers;
-    bool bound_buffer_saved{};
-    u32 bound_buffer{};
-};
-
-} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index 0229733b6..2e2711350 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/control_flow.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -64,11 +65,11 @@ struct BlockInfo {
 };
 
 struct CFGRebuildState {
-    explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker)
-        : program_code{program_code}, locker{locker}, start{start} {}
+    explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry)
+        : program_code{program_code}, registry{registry}, start{start} {}
 
     const ProgramCode& program_code;
-    ConstBufferLocker& locker;
+    Registry& registry;
     u32 start{};
     std::vector<BlockInfo> block_info;
     std::list<u32> inspect_queries;
@@ -438,7 +439,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address)
             const s32 pc_target = offset + result.relative_position;
             std::vector<CaseBranch> branches;
             for (u32 i = 0; i < result.entries; i++) {
-                auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4);
+                auto key = state.registry.ObtainKey(result.buffer, result.offset + i * 4);
                 if (!key) {
                     return {ParseResult::AbnormalFlow, parse_info};
                 }
@@ -656,14 +657,14 @@ void DecompileShader(CFGRebuildState& state) {
 
 std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
                                                 const CompilerSettings& settings,
-                                                ConstBufferLocker& locker) {
+                                                Registry& registry) {
     auto result_out = std::make_unique<ShaderCharacteristics>();
     if (settings.depth == CompileDepth::BruteForce) {
         result_out->settings.depth = CompileDepth::BruteForce;
         return result_out;
     }
 
-    CFGRebuildState state{program_code, start_address, locker};
+    CFGRebuildState state{program_code, start_address, registry};
     // Inspect Code and generate blocks
     state.labels.clear();
     state.labels.emplace(start_address);
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
index 5304998b9..62a3510d8 100644
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -12,6 +12,7 @@
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/compiler_settings.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -111,6 +112,6 @@ struct ShaderCharacteristics {
 
 std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
                                                 const CompilerSettings& settings,
-                                                ConstBufferLocker& locker);
+                                                Registry& registry);
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index 6b697ed5d..87ac9ac6c 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -34,13 +34,9 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
     return (absolute_offset % SchedPeriod) == 0;
 }
 
-void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver,
+void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver,
                               const std::list<Sampler>& used_samplers) {
-    if (gpu_driver == nullptr) {
-        LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet");
-        return;
-    }
-    if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) {
+    if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) {
         return;
     }
     u32 count{};
@@ -53,17 +49,13 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver,
         bound_offsets.emplace_back(sampler.GetOffset());
     }
     if (count > 1) {
-        gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets));
+        gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets));
     }
 }
 
 std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
-                                        VideoCore::GuestDriverProfile* gpu_driver,
+                                        VideoCore::GuestDriverProfile& gpu_driver,
                                         const std::list<Sampler>& used_samplers) {
-    if (gpu_driver == nullptr) {
-        LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet");
-        return std::nullopt;
-    }
     const u32 base_offset = sampler_to_deduce.GetOffset();
     u32 max_offset{std::numeric_limits<u32>::max()};
     for (const auto& sampler : used_samplers) {
@@ -77,7 +69,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
     if (max_offset == std::numeric_limits<u32>::max()) {
         return std::nullopt;
     }
-    return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize();
+    return ((max_offset - base_offset) * 4) / gpu_driver.GetTextureHandlerSize();
 }
 
 } // Anonymous namespace
@@ -149,7 +141,7 @@ void ShaderIR::Decode() {
     std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
 
     decompiled = false;
-    auto info = ScanFlow(program_code, main_offset, settings, locker);
+    auto info = ScanFlow(program_code, main_offset, settings, registry);
     auto& shader_info = *info;
     coverage_begin = shader_info.start;
     coverage_end = shader_info.end;
@@ -364,7 +356,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
 
 void ShaderIR::PostDecode() {
     // Deduce texture handler size if needed
-    auto gpu_driver = locker.AccessGuestDriverProfile();
+    auto gpu_driver = registry.AccessGuestDriverProfile();
     DeduceTextureHandlerSize(gpu_driver, used_samplers);
     // Deduce Indexed Samplers
     if (!uses_indexed_samplers) {
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 90240c765..478394682 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -53,29 +53,27 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
 
         op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
 
-        // TODO(Rodrigo): Should precise be used when there's a postfactor?
-        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+        // Scale applied to op_a, indexed by the raw 3-bit postfactor field (1's complement).
+        // All eight encodings need an entry; 7 is "negative zero" and must not scale the value.
+        static constexpr std::array FmulPostFactor = {
+            1.000f, // None
+            0.500f, // Divide 2
+            0.250f, // Divide 4
+            0.125f, // Divide 8
+            8.000f, // Mul 8
+            4.000f, // Mul 4
+            2.000f, // Mul 2
+            1.000f, // None (negative zero)
+        };
 
         if (instr.fmul.postfactor != 0) {
-            auto postfactor = static_cast<s32>(instr.fmul.postfactor);
-
-            // Postfactor encoded as 3-bit 1's complement in instruction, interpreted with below
-            // logic.
-            if (postfactor >= 4) {
-                postfactor = 7 - postfactor;
-            } else {
-                postfactor = 0 - postfactor;
-            }
-
-            if (postfactor > 0) {
-                value = Operation(OperationCode::FMul, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << postfactor)));
-            } else {
-                value = Operation(OperationCode::FDiv, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << -postfactor)));
-            }
+            op_a = Operation(OperationCode::FMul, NO_PRECISE, op_a,
+                             Immediate(FmulPostFactor[instr.fmul.postfactor]));
         }
 
+        // TODO(Rodrigo): Should precise be used when there's a postfactor?
+        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+
         value = GetSaturatedFloat(value, instr.alu.saturate_d);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index 21366869d..2fe787d6f 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -293,44 +293,66 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
 
 void ShaderIR::WriteLop3Instruction(NodeBlock& bb, Register dest, Node op_a, Node op_b, Node op_c,
                                     Node imm_lut, bool sets_cc) {
-    constexpr u32 lop_iterations = 32;
-    const Node one = Immediate(1);
-    const Node two = Immediate(2);
-
-    Node value;
-    for (u32 i = 0; i < lop_iterations; ++i) {
-        const Node shift_amount = Immediate(i);
-
-        const Node a = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_c, shift_amount);
-        const Node pack_0 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, one);
-
-        const Node b = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_b, shift_amount);
-        const Node c = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, b, one);
-        const Node pack_1 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, c, one);
-
-        const Node d = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_a, shift_amount);
-        const Node e = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, d, one);
-        const Node pack_2 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, e, two);
-
-        const Node pack_01 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_0, pack_1);
-        const Node pack_012 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_01, pack_2);
-
-        const Node shifted_bit =
-            Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, imm_lut, pack_012);
-        const Node bit = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, shifted_bit, one);
-
-        const Node right =
-            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, bit, shift_amount);
-
-        if (i > 0) {
-            value = Operation(OperationCode::IBitwiseOr, NO_PRECISE, value, right);
-        } else {
-            value = right;
+    const Node lop3_fast = [&](const Node na, const Node nb, const Node nc, const Node ttbl) {
+        Node value = Immediate(0);
+        const ImmediateNode imm = std::get<ImmediateNode>(*ttbl);
+        if (imm.GetValue() & 0x01) {
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
         }
-    }
+        if (imm.GetValue() & 0x02) {
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x04) {
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x08) {
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x10) {
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x20) {
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x40) {
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x80) {
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        return value;
+    }(op_a, op_b, op_c, imm_lut);
 
-    SetInternalFlagsFromInteger(bb, value, sets_cc);
-    SetRegister(bb, dest, value);
+    SetInternalFlagsFromInteger(bb, lop3_fast, sets_cc);
+    SetRegister(bb, dest, lop3_fast);
 }
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp
index e02bcd097..8e3b46e8e 100644
--- a/src/video_core/shader/decode/bfe.cpp
+++ b/src/video_core/shader/decode/bfe.cpp
@@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.bfe.negate_b);
-
     Node op_a = GetRegister(instr.gpr8);
-    op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false);
-
-    switch (opcode->get().GetId()) {
-    case OpCode::Id::BFE_IMM: {
-        UNIMPLEMENTED_IF_MSG(instr.generates_cc,
-                             "Condition codes generation in BFE is not implemented");
+    Node op_b = [&] {
+        switch (opcode->get().GetId()) {
+        case OpCode::Id::BFE_R:
+            return GetRegister(instr.gpr20);
+        case OpCode::Id::BFE_C:
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+        case OpCode::Id::BFE_IMM:
+            return Immediate(instr.alu.GetSignedImm20_20());
+        default:
+            UNREACHABLE();
+            return Immediate(0);
+        }
+    }();
 
-        const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue()));
-        const Node outer_shift_imm =
-            Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position));
+    UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented");
 
-        const Node inner_shift =
-            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm);
-        const Node outer_shift =
-            Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm);
+    const bool is_signed = instr.bfe.is_signed;
 
-        SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, outer_shift);
-        break;
-    }
-    default:
-        UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName());
+    // Reverse the bits of op_a using the parallel method described in
+    // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+    // NOTE: Revisit later if a faster method becomes possible.
+    if (instr.bfe.brev) {
+        const auto swap = [&](u32 s, u32 mask) {
+            Node v1 =
+                SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s));
+            if (mask != 0) {
+                v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1),
+                                     Immediate(mask));
+            }
+            Node v2 = op_a;
+            if (mask != 0) {
+                v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2),
+                                     Immediate(mask));
+            }
+            v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2),
+                                 Immediate(s));
+            return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1),
+                                   std::move(v2));
+        };
+        op_a = swap(1, 0x55555555U);
+        op_a = swap(2, 0x33333333U);
+        op_a = swap(4, 0x0F0F0F0FU);
+        op_a = swap(8, 0x00FF00FFU);
+        op_a = swap(16, 0);
     }
 
+    const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
+                                        Immediate(0), Immediate(8));
+    const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
+                                      Immediate(8), Immediate(8));
+    auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits);
+    SetRegister(bb, instr.gpr0, std::move(result));
+
     return pc;
 }
 
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index bee7d8cad..48350e042 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -12,6 +12,7 @@
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/node_helper.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -359,8 +360,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample
     if (sampler_info) {
         return *sampler_info;
     }
-    const auto sampler =
-        buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset);
+    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
+                                : registry.ObtainBoundSampler(offset);
     if (!sampler) {
         LOG_WARNING(HW_GPU, "Unknown sampler info");
         return SamplerInfo{TextureType::Texture2D, false, false, false};
diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp
index 206961909..6191ffba1 100644
--- a/src/video_core/shader/decode/xmad.cpp
+++ b/src/video_core/shader/decode/xmad.cpp
@@ -12,6 +12,7 @@ namespace VideoCommon::Shader {
 
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
 
 u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
@@ -30,7 +31,7 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
     const bool is_signed_b = instr.xmad.sign_b == 1;
     const bool is_signed_c = is_signed_a;
 
-    auto [is_merge, is_psl, is_high_b, mode, op_b,
+    auto [is_merge, is_psl, is_high_b, mode, op_b_binding,
           op_c] = [&]() -> std::tuple<bool, bool, bool, Tegra::Shader::XmadMode, Node, Node> {
         switch (opcode->get().GetId()) {
         case OpCode::Id::XMAD_CR:
@@ -63,15 +64,19 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
         }
     }();
 
-    op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16);
+    op_a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(op_a),
+                           instr.xmad.high_a ? Immediate(16) : Immediate(0), Immediate(16));
 
-    const Node original_b = op_b;
-    op_b = BitfieldExtract(op_b, is_high_b ? 16 : 0, 16);
+    const Node original_b = op_b_binding;
+    const Node op_b =
+        SignedOperation(OperationCode::IBitfieldExtract, is_signed_b, std::move(op_b_binding),
+                        is_high_b ? Immediate(16) : Immediate(0), Immediate(16));
 
-    // TODO(Rodrigo): Use an appropiate sign for this operation
-    Node product = Operation(OperationCode::IMul, NO_PRECISE, op_a, op_b);
+    // sign_a and sign_b were already checked for a mismatch above, so either one works here.
+    Node product = SignedOperation(OperationCode::IMul, is_signed_a, op_a, op_b);
     if (is_psl) {
-        product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));
+        product =
+            SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_a, product, Immediate(16));
     }
     SetTemporary(bb, 0, product);
     product = GetTemporary(0);
@@ -88,12 +93,40 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
             return BitfieldExtract(original_c, 16, 16);
         case Tegra::Shader::XmadMode::CBcc: {
             const Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b,
-                                                   NO_PRECISE, original_b, Immediate(16));
-            return SignedOperation(OperationCode::IAdd, is_signed_c, NO_PRECISE, original_c,
-                                   shifted_b);
+                                                   original_b, Immediate(16));
+            return SignedOperation(OperationCode::IAdd, is_signed_c, original_c, shifted_b);
+        }
+        case Tegra::Shader::XmadMode::CSfu: {
+            const Node comp_a = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_a,
+                                                              op_a, Immediate(0));
+            const Node comp_b = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_b,
+                                                              op_b, Immediate(0));
+            const Node comp = Operation(OperationCode::LogicalOr, comp_a, comp_b);
+
+            const Node comp_minus_a = GetPredicateComparisonInteger(
+                PredCondition::NotEqual, is_signed_a,
+                SignedOperation(OperationCode::IBitwiseAnd, is_signed_a, op_a,
+                                Immediate(0x80000000)),
+                Immediate(0));
+            const Node comp_minus_b = GetPredicateComparisonInteger(
+                PredCondition::NotEqual, is_signed_b,
+                SignedOperation(OperationCode::IBitwiseAnd, is_signed_b, op_b,
+                                Immediate(0x80000000)),
+                Immediate(0));
+
+            Node new_c = Operation(
+                OperationCode::Select, comp_minus_a,
+                SignedOperation(OperationCode::IAdd, is_signed_c, original_c, Immediate(-65536)),
+                original_c);
+            new_c = Operation(
+                OperationCode::Select, comp_minus_b,
+                SignedOperation(OperationCode::IAdd, is_signed_c, new_c, Immediate(-65536)),
+                std::move(new_c));
+
+            return Operation(OperationCode::Select, comp, original_c, std::move(new_c));
         }
         default:
-            UNIMPLEMENTED_MSG("Unhandled XMAD mode: {}", static_cast<u32>(instr.xmad.mode.Value()));
+            UNREACHABLE();
             return Immediate(0);
         }
     }();
@@ -102,18 +135,19 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
     op_c = GetTemporary(1);
 
     // TODO(Rodrigo): Use an appropiate sign for this operation
-    Node sum = Operation(OperationCode::IAdd, product, op_c);
+    Node sum = SignedOperation(OperationCode::IAdd, is_signed_a, product, std::move(op_c));
     SetTemporary(bb, 2, sum);
     sum = GetTemporary(2);
     if (is_merge) {
-        const Node a = BitfieldExtract(sum, 0, 16);
-        const Node b =
-            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, original_b, Immediate(16));
-        sum = Operation(OperationCode::IBitwiseOr, NO_PRECISE, a, b);
+        const Node a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(sum),
+                                       Immediate(0), Immediate(16));
+        const Node b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, original_b,
+                                       Immediate(16));
+        sum = SignedOperation(OperationCode::IBitwiseOr, is_signed_a, a, b);
     }
 
     SetInternalFlagsFromInteger(bb, sum, instr.generates_cc);
-    SetRegister(bb, instr.gpr0, sum);
+    SetRegister(bb, instr.gpr0, std::move(sum));
 
     return pc;
 }
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index a0a7b9111..a1828546e 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -299,7 +299,7 @@ private:
     u32 index{};  ///< Emulated index given for the this sampler.
     u32 offset{}; ///< Offset in the const buffer from where the sampler is being read.
     u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
-    u32 size{};   ///< Size of the sampler if indexed.
+    u32 size{1};  ///< Size of the sampler.
 
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
     bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp
index b3dcd291c..76c56abb5 100644
--- a/src/video_core/shader/node_helper.cpp
+++ b/src/video_core/shader/node_helper.cpp
@@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed)
         return OperationCode::UBitwiseXor;
     case OperationCode::IBitwiseNot:
         return OperationCode::UBitwiseNot;
+    case OperationCode::IBitfieldExtract:
+        return OperationCode::UBitfieldExtract;
     case OperationCode::IBitfieldInsert:
         return OperationCode::UBitfieldInsert;
     case OperationCode::IBitCount:
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
new file mode 100644
index 000000000..af70b3f35
--- /dev/null
+++ b/src/video_core/shader/registry.cpp
@@ -0,0 +1,161 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <tuple>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Engines::ConstBufferEngineInterface;
+using Tegra::Engines::SamplerDescriptor;
+using Tegra::Engines::ShaderType;
+
+namespace {
+
+GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
+    if (shader_stage == ShaderType::Compute) {
+        return {};
+    }
+    auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine);
+
+    GraphicsInfo info;
+    info.tfb_layouts = graphics.regs.tfb_layouts;
+    info.tfb_varying_locs = graphics.regs.tfb_varying_locs;
+    info.primitive_topology = graphics.regs.draw.topology;
+    info.tessellation_primitive = graphics.regs.tess_mode.prim;
+    info.tessellation_spacing = graphics.regs.tess_mode.spacing;
+    info.tfb_enabled = graphics.regs.tfb_enabled;
+    info.tessellation_clockwise = graphics.regs.tess_mode.cw;
+    return info;
+}
+
+ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
+    if (shader_stage != ShaderType::Compute) {
+        return {};
+    }
+    auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine);
+    const auto& launch = compute.launch_description;
+
+    ComputeInfo info;
+    info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z};
+    info.local_memory_size_in_words = launch.local_pos_alloc;
+    info.shared_memory_size_in_words = launch.shared_alloc;
+    return info;
+}
+
+} // Anonymous namespace
+
+Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info)
+    : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile},
+      bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {}
+
+Registry::Registry(Tegra::Engines::ShaderType shader_stage,
+                   Tegra::Engines::ConstBufferEngineInterface& engine)
+    : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()},
+      graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo(
+                                                                 shader_stage, engine)} {}
+
+Registry::~Registry() = default;
+
+std::optional<u32> Registry::ObtainKey(u32 buffer, u32 offset) {
+    const std::pair<u32, u32> key = {buffer, offset};
+    const auto iter = keys.find(key);
+    if (iter != keys.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
+    keys.emplace(key, value);
+    return value;
+}
+
+std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
+    const u32 key = offset;
+    const auto iter = bound_samplers.find(key);
+    if (iter != bound_samplers.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
+    bound_samplers.emplace(key, value);
+    return value;
+}
+
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
+                                                                                 u32 offset) {
+    const std::pair key = {buffer, offset};
+    const auto iter = bindless_samplers.find(key);
+    if (iter != bindless_samplers.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
+    bindless_samplers.emplace(key, value);
+    return value;
+}
+
+void Registry::InsertKey(u32 buffer, u32 offset, u32 value) {
+    keys.insert_or_assign({buffer, offset}, value);
+}
+
+void Registry::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
+    bound_samplers.insert_or_assign(offset, sampler);
+}
+
+void Registry::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
+    bindless_samplers.insert_or_assign({buffer, offset}, sampler);
+}
+
+bool Registry::IsConsistent() const {
+    if (!engine) {
+        return true;
+    }
+    return std::all_of(keys.begin(), keys.end(),
+                       [this](const auto& pair) {
+                           const auto [cbuf, offset] = pair.first;
+                           const auto value = pair.second;
+                           return value == engine->AccessConstBuffer32(stage, cbuf, offset);
+                       }) &&
+           std::all_of(bound_samplers.begin(), bound_samplers.end(),
+                       [this](const auto& sampler) {
+                           const auto [key, value] = sampler;
+                           return value == engine->AccessBoundSampler(stage, key);
+                       }) &&
+           std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
+                       [this](const auto& sampler) {
+                           const auto [cbuf, offset] = sampler.first;
+                           const auto value = sampler.second;
+                           return value == engine->AccessBindlessSampler(stage, cbuf, offset);
+                       });
+}
+
+bool Registry::HasEqualKeys(const Registry& rhs) const {
+    return std::tie(keys, bound_samplers, bindless_samplers) ==
+           std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
+}
+
+const GraphicsInfo& Registry::GetGraphicsInfo() const {
+    ASSERT(stage != Tegra::Engines::ShaderType::Compute);
+    return graphics_info;
+}
+
+const ComputeInfo& Registry::GetComputeInfo() const {
+    ASSERT(stage == Tegra::Engines::ShaderType::Compute);
+    return compute_info;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
new file mode 100644
index 000000000..0c80d35fd
--- /dev/null
+++ b/src/video_core/shader/registry.h
@@ -0,0 +1,137 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <optional>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+
+#include "common/common_types.h"
+#include "common/hash.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/guest_driver.h"
+
+namespace VideoCommon::Shader {
+
+using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
+using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using BindlessSamplerMap =
+    std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
+
+struct GraphicsInfo {
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+    std::array<Maxwell::TransformFeedbackLayout, Maxwell::NumTransformFeedbackBuffers>
+        tfb_layouts{};
+    std::array<std::array<u8, 128>, Maxwell::NumTransformFeedbackBuffers> tfb_varying_locs{};
+    Maxwell::PrimitiveTopology primitive_topology{};
+    Maxwell::TessellationPrimitive tessellation_primitive{};
+    Maxwell::TessellationSpacing tessellation_spacing{};
+    bool tfb_enabled = false;
+    bool tessellation_clockwise = false;
+};
+static_assert(std::is_trivially_copyable_v<GraphicsInfo> &&
+              std::is_standard_layout_v<GraphicsInfo>);
+
+struct ComputeInfo {
+    std::array<u32, 3> workgroup_size{};
+    u32 shared_memory_size_in_words = 0;
+    u32 local_memory_size_in_words = 0;
+};
+static_assert(std::is_trivially_copyable_v<ComputeInfo> && std::is_standard_layout_v<ComputeInfo>);
+
+struct SerializedRegistryInfo {
+    VideoCore::GuestDriverProfile guest_driver_profile;
+    u32 bound_buffer = 0;
+    GraphicsInfo graphics;
+    ComputeInfo compute;
+};
+
+/**
+ * The Registry is a class used to interface the 3D and compute engines with the shader compiler.
+ * With it, the shader can obtain required data from GPU state and store it for disk shader
+ * compilation.
+ */
+class Registry {
+public:
+    explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info);
+
+    explicit Registry(Tegra::Engines::ShaderType shader_stage,
+                      Tegra::Engines::ConstBufferEngineInterface& engine);
+
+    ~Registry();
+
+    /// Retrieves a key from the registry. If it's registered, it returns the registered value;
+    /// otherwise it obtains the value from the engine (when one is attached) and registers it.
+    std::optional<u32> ObtainKey(u32 buffer, u32 offset);
+
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
+
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
+
+    /// Inserts a key.
+    void InsertKey(u32 buffer, u32 offset, u32 value);
+
+    /// Inserts a bound sampler key.
+    void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
+
+    /// Inserts a bindless sampler key.
+    void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
+
+    /// Checks keys and samplers against engine's current const buffers.
+    /// Returns true if they are the same value, false otherwise.
+    bool IsConsistent() const;
+
+    /// Returns true if the keys are equal to the other ones in the registry.
+    bool HasEqualKeys(const Registry& rhs) const;
+
+    /// Returns graphics information from this shader
+    const GraphicsInfo& GetGraphicsInfo() const;
+
+    /// Returns compute information from this shader
+    const ComputeInfo& GetComputeInfo() const;
+
+    /// Gives a getter to the const buffer keys in the database.
+    const KeyMap& GetKeys() const {
+        return keys;
+    }
+
+    /// Gets samplers database.
+    const BoundSamplerMap& GetBoundSamplers() const {
+        return bound_samplers;
+    }
+
+    /// Gets bindless samplers database.
+    const BindlessSamplerMap& GetBindlessSamplers() const {
+        return bindless_samplers;
+    }
+
+    /// Gets bound buffer used on this shader
+    u32 GetBoundBuffer() const {
+        return bound_buffer;
+    }
+
+    /// Obtains access to the guest driver's profile.
+    VideoCore::GuestDriverProfile& AccessGuestDriverProfile() {
+        return engine ? engine->AccessGuestDriverProfile() : stored_guest_driver_profile;
+    }
+
+private:
+    const Tegra::Engines::ShaderType stage;
+    VideoCore::GuestDriverProfile stored_guest_driver_profile;
+    Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
+    KeyMap keys;
+    BoundSamplerMap bound_samplers;
+    BindlessSamplerMap bindless_samplers;
+    u32 bound_buffer;
+    GraphicsInfo graphics_info;
+    ComputeInfo compute_info;
+};
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 3a5d280a9..baf7188d2 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -11,6 +11,7 @@
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/node_helper.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -24,8 +25,8 @@ using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;
 
 ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
-                   ConstBufferLocker& locker)
-    : program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} {
+                   Registry& registry)
+    : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} {
     Decode();
     PostDecode();
 }
@@ -95,6 +96,7 @@ Node ShaderIR::GetPredicate(bool immediate) {
 }
 
 Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
+    MarkAttributeUsage(index, element);
     used_input_attributes.emplace(index);
     return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
 }
@@ -105,42 +107,8 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres
 }
 
 Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
-    if (index == Attribute::Index::LayerViewportPointSize) {
-        switch (element) {
-        case 0:
-            UNIMPLEMENTED();
-            break;
-        case 1:
-            uses_layer = true;
-            break;
-        case 2:
-            uses_viewport_index = true;
-            break;
-        case 3:
-            uses_point_size = true;
-            break;
-        }
-    }
-    if (index == Attribute::Index::TessCoordInstanceIDVertexID) {
-        switch (element) {
-        case 2:
-            uses_instance_id = true;
-            break;
-        case 3:
-            uses_vertex_id = true;
-            break;
-        default:
-            break;
-        }
-    }
-    if (index == Attribute::Index::ClipDistances0123 ||
-        index == Attribute::Index::ClipDistances4567) {
-        const auto clip_index =
-            static_cast<u32>((index == Attribute::Index::ClipDistances4567 ? 1 : 0) + element);
-        used_clip_distances.at(clip_index) = true;
-    }
+    MarkAttributeUsage(index, element);
     used_output_attributes.insert(index);
-
     return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
 }
 
@@ -451,6 +419,54 @@ Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
                      Immediate(bits));
 }
 
+void ShaderIR::MarkAttributeUsage(Attribute::Index index, u64 element) {
+    switch (index) {
+    case Attribute::Index::LayerViewportPointSize:
+        switch (element) {
+        case 0:
+            UNIMPLEMENTED();
+            break;
+        case 1:
+            uses_layer = true;
+            break;
+        case 2:
+            uses_viewport_index = true;
+            break;
+        case 3:
+            uses_point_size = true;
+            break;
+        }
+        break;
+    case Attribute::Index::TessCoordInstanceIDVertexID:
+        switch (element) {
+        case 2:
+            uses_instance_id = true;
+            break;
+        case 3:
+            uses_vertex_id = true;
+            break;
+        }
+        break;
+    case Attribute::Index::ClipDistances0123:
+    case Attribute::Index::ClipDistances4567: {
+        const u64 clip_index = (index == Attribute::Index::ClipDistances4567 ? 4 : 0) + element;
+        used_clip_distances.at(clip_index) = true;
+        break;
+    }
+    case Attribute::Index::FrontColor:
+    case Attribute::Index::FrontSecondaryColor:
+    case Attribute::Index::BackColor:
+    case Attribute::Index::BackSecondaryColor:
+        uses_legacy_varyings = true;
+        break;
+    default:
+        if (index >= Attribute::Index::TexCoord_0 && index <= Attribute::Index::TexCoord_7) {
+            uses_legacy_varyings = true;
+        }
+        break;
+    }
+}
+
 std::size_t ShaderIR::DeclareAmend(Node new_amend) {
     const std::size_t id = amend_code.size();
     amend_code.push_back(new_amend);
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index b0851c3be..80fc9b82c 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -18,8 +18,8 @@
 #include "video_core/engines/shader_header.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/compiler_settings.h"
-#include "video_core/shader/const_buffer_locker.h"
 #include "video_core/shader/node.h"
+#include "video_core/shader/registry.h"
 
 namespace VideoCommon::Shader {
 
@@ -69,7 +69,7 @@ struct GlobalMemoryUsage {
 class ShaderIR final {
 public:
     explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
-                      ConstBufferLocker& locker);
+                      Registry& registry);
     ~ShaderIR();
 
     const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@@ -137,6 +137,10 @@ public:
         return uses_vertex_id;
     }
 
+    bool UsesLegacyVaryings() const {
+        return uses_legacy_varyings;
+    }
+
     bool UsesWarps() const {
         return uses_warps;
     }
@@ -343,6 +347,9 @@ private:
     /// Inserts a sequence of bits from a node
     Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
 
+    /// Marks the usage of an input or output attribute.
+    void MarkAttributeUsage(Tegra::Shader::Attribute::Index index, u64 element);
+
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                   const Node4& components);
 
@@ -414,7 +421,7 @@ private:
     const ProgramCode& program_code;
     const u32 main_offset;
     const CompilerSettings settings;
-    ConstBufferLocker& locker;
+    Registry& registry;
 
     bool decompiled{};
     bool disable_flow_stack{};
@@ -443,6 +450,7 @@ private:
     bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
     bool uses_instance_id{};
     bool uses_vertex_id{};
+    bool uses_legacy_varyings{};
     bool uses_warps{};
     bool uses_indexed_samplers{};
 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index face8c943..10739b37d 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -81,26 +81,20 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
                 MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
             return {tracked, track};
         } else if (const auto operation = std::get_if<OperationNode>(&*offset)) {
-            auto bound_buffer = locker.ObtainBoundBuffer();
-            if (!bound_buffer) {
+            const u32 bound_buffer = registry.GetBoundBuffer();
+            if (bound_buffer != cbuf->GetIndex()) {
                 return {};
             }
-            if (*bound_buffer != cbuf->GetIndex()) {
-                return {};
-            }
-            auto pair = DecoupleIndirectRead(*operation);
+            const auto pair = DecoupleIndirectRead(*operation);
             if (!pair) {
                 return {};
             }
             auto [gpr, base_offset] = *pair;
             const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
-            auto gpu_driver = locker.AccessGuestDriverProfile();
-            if (gpu_driver == nullptr) {
-                return {};
-            }
+            const auto& gpu_driver = registry.AccessGuestDriverProfile();
             const u32 bindless_cv = NewCustomVariable();
-            const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr,
-                                      Immediate(gpu_driver->GetTextureHandlerSize()));
+            const Node op =
+                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
 
             const Node cv_node = GetCustomVariable(bindless_cv);
             Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
@@ -157,13 +151,21 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
             return {};
         }
-        // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
-        // register that it uses as operand
-        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
-        if (!source) {
-            return {};
+        s64 current_cursor = cursor;
+        while (current_cursor > 0) {
+            // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
+            // register that it uses as operand
+            const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1);
+            current_cursor = new_cursor;
+            if (!source) {
+                continue;
+            }
+            const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor);
+            if (base_address != nullptr) {
+                return {base_address, index, offset};
+            }
         }
-        return TrackCbuf(source, code, new_cursor);
+        return {};
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
         for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp
new file mode 100644
index 000000000..22a933761
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.cpp
@@ -0,0 +1,115 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <unordered_map>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/transform_feedback.h"
+
+namespace VideoCommon::Shader {
+
+namespace {
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20
+
+/// Attribute locations (one unit per component, four per attribute) that start a vector
+constexpr std::array VECTORS = {
+    28,  // gl_Position
+    32,  // Generic 0
+    36,  // Generic 1
+    40,  // Generic 2
+    44,  // Generic 3
+    48,  // Generic 4
+    52,  // Generic 5
+    56,  // Generic 6
+    60,  // Generic 7
+    64,  // Generic 8
+    68,  // Generic 9
+    72,  // Generic 10
+    76,  // Generic 11
+    80,  // Generic 12
+    84,  // Generic 13
+    88,  // Generic 14
+    92,  // Generic 15
+    96,  // Generic 16
+    100, // Generic 17
+    104, // Generic 18
+    108, // Generic 19
+    112, // Generic 20
+    116, // Generic 21
+    120, // Generic 22
+    124, // Generic 23
+    128, // Generic 24
+    132, // Generic 25
+    136, // Generic 26
+    140, // Generic 27
+    144, // Generic 28
+    148, // Generic 29
+    152, // Generic 30
+    156, // Generic 31
+    160, // gl_FrontColor
+    164, // gl_FrontSecondaryColor
+    168, // gl_BackColor (follows gl_FrontSecondaryColor; was a duplicate of 160)
+    172, // gl_BackSecondaryColor (was a duplicate of 164)
+    192, // gl_TexCoord[0]
+    196, // gl_TexCoord[1]
+    200, // gl_TexCoord[2]
+    204, // gl_TexCoord[3]
+    208, // gl_TexCoord[4]
+    212, // gl_TexCoord[5]
+    216, // gl_TexCoord[6]
+    220, // gl_TexCoord[7]
+};
+} // namespace
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) {
+
+    std::unordered_map<u8, VaryingTFB> tfb;
+
+    for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) {
+        const auto& locations = info.tfb_varying_locs[buffer];
+        const auto& layout = info.tfb_layouts[buffer];
+        const std::size_t varying_count = layout.varying_count;
+
+        std::size_t highest = 0;
+
+        for (std::size_t offset = 0; offset < varying_count; ++offset) {
+            const std::size_t base_offset = offset;
+            const u8 location = locations[offset];
+
+            VaryingTFB varying;
+            varying.buffer = layout.stream;
+            varying.stride = layout.stride;
+            varying.offset = offset * sizeof(u32);
+            varying.components = 1;
+
+            if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) {
+                UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB");
+
+                const u8 base_index = location / 4;
+                while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) {
+                    ++offset;
+                    ++varying.components;
+                }
+            }
+
+            [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second;
+            UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored");
+
+            highest = std::max(highest, (base_offset + varying.components) * sizeof(u32));
+        }
+
+        UNIMPLEMENTED_IF(highest != layout.stride);
+    }
+    return tfb;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h
new file mode 100644
index 000000000..77d05f64c
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.h
@@ -0,0 +1,23 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "common/common_types.h"
+#include "video_core/shader/registry.h"
+
+namespace VideoCommon::Shader {
+
+struct VaryingTFB {
+    std::size_t buffer;
+    std::size_t stride;
+    std::size_t offset;
+    std::size_t components;
+};
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info);
+
+} // namespace VideoCommon::Shader