37 files changed, 2379 insertions, 962 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index aa5bc3bbe..f5ae57039 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -5,6 +5,8 @@ add_library(video_core STATIC
     debug_utils/debug_utils.h
     engines/fermi_2d.cpp
     engines/fermi_2d.h
+    engines/kepler_memory.cpp
+    engines/kepler_memory.h
     engines/maxwell_3d.cpp
     engines/maxwell_3d.h
     engines/maxwell_compute.cpp
@@ -12,6 +14,7 @@ add_library(video_core STATIC
     engines/maxwell_dma.cpp
     engines/maxwell_dma.h
     engines/shader_bytecode.h
+    engines/shader_header.h
     gpu.cpp
     gpu.h
     macro_interpreter.cpp
@@ -22,6 +25,8 @@ add_library(video_core STATIC
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_buffer_cache.cpp
+    renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_rasterizer.cpp
     renderer_opengl/gl_rasterizer.h
     renderer_opengl/gl_rasterizer_cache.cpp
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index dc485e811..f1aa6091b 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -14,6 +14,7 @@
 #include "core/tracer/recorder.h"
 #include "video_core/command_processor.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -28,98 +29,109 @@ enum class BufferMethods {
     CountBufferMethods = 0x40,
 };
 
-void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
-    LOG_TRACE(HW_GPU,
-              "Processing method {:08X} on subchannel {} value "
-              "{:08X} remaining params {}",
-              method, subchannel, value, remaining_params);
-
-    if (method == static_cast<u32>(BufferMethods::BindObject)) {
-        // Bind the current subchannel to the desired engine id.
-        LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value);
-        bound_engines[subchannel] = static_cast<EngineID>(value);
-        return;
-    }
+MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192));
 
-    if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) {
-        // TODO(Subv): Research and implement these methods.
-        LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented");
-        return;
-    }
+void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) {
+    MICROPROFILE_SCOPE(ProcessCommandLists);
 
-    ASSERT(bound_engines.find(subchannel) != bound_engines.end());
-
-    const EngineID engine = bound_engines[subchannel];
-
-    switch (engine) {
-    case EngineID::FERMI_TWOD_A:
-        fermi_2d->WriteReg(method, value);
-        break;
-    case EngineID::MAXWELL_B:
-        maxwell_3d->WriteReg(method, value, remaining_params);
-        break;
-    case EngineID::MAXWELL_COMPUTE_B:
-        maxwell_compute->WriteReg(method, value);
-        break;
-    case EngineID::MAXWELL_DMA_COPY_A:
-        maxwell_dma->WriteReg(method, value);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented engine");
-    }
-}
+    auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) {
+        LOG_TRACE(HW_GPU,
+                  "Processing method {:08X} on subchannel {} value "
+                  "{:08X} remaining params {}",
+                  method, subchannel, value, remaining_params);
 
-void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
-    const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address);
-    VAddr current_addr = *head_address;
-    while (current_addr < *head_address + size * sizeof(CommandHeader)) {
-        const CommandHeader header = {Memory::Read32(current_addr)};
-        current_addr += sizeof(u32);
-
-        switch (header.mode.Value()) {
-        case SubmissionMode::IncreasingOld:
-        case SubmissionMode::Increasing: {
-            // Increase the method value with each argument.
-            for (unsigned i = 0; i < header.arg_count; ++i) {
-                WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr),
-                         header.arg_count - i - 1);
-                current_addr += sizeof(u32);
-            }
-            break;
+        ASSERT(subchannel < bound_engines.size());
+
+        if (method == static_cast<u32>(BufferMethods::BindObject)) {
+            // Bind the current subchannel to the desired engine id.
+            LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value);
+            bound_engines[subchannel] = static_cast<EngineID>(value);
+            return;
         }
-        case SubmissionMode::NonIncreasingOld:
-        case SubmissionMode::NonIncreasing: {
-            // Use the same method value for all arguments.
-            for (unsigned i = 0; i < header.arg_count; ++i) {
-                WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
-                         header.arg_count - i - 1);
-                current_addr += sizeof(u32);
-            }
+
+        if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) {
+            // TODO(Subv): Research and implement these methods.
+            LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented");
+            return;
+        }
+
+        const EngineID engine = bound_engines[subchannel];
+
+        switch (engine) {
+        case EngineID::FERMI_TWOD_A:
+            fermi_2d->WriteReg(method, value);
+            break;
+        case EngineID::MAXWELL_B:
+            maxwell_3d->WriteReg(method, value, remaining_params);
             break;
+        case EngineID::MAXWELL_COMPUTE_B:
+            maxwell_compute->WriteReg(method, value);
+            break;
+        case EngineID::MAXWELL_DMA_COPY_A:
+            maxwell_dma->WriteReg(method, value);
+            break;
+        case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+            kepler_memory->WriteReg(method, value);
+            break;
+        default:
+            UNIMPLEMENTED_MSG("Unimplemented engine");
         }
-        case SubmissionMode::IncreaseOnce: {
-            ASSERT(header.arg_count.Value() >= 1);
+    };
 
-            // Use the original method for the first argument and then the next method for all other
-            // arguments.
-            WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
-                     header.arg_count - 1);
+    for (auto entry : commands) {
+        Tegra::GPUVAddr address = entry.Address();
+        u32 size = entry.sz;
+        const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address);
+        VAddr current_addr = *head_address;
+        while (current_addr < *head_address + size * sizeof(CommandHeader)) {
+            const CommandHeader header = {Memory::Read32(current_addr)};
             current_addr += sizeof(u32);
 
-            for (unsigned i = 1; i < header.arg_count; ++i) {
-                WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr),
-                         header.arg_count - i - 1);
+            switch (header.mode.Value()) {
+            case SubmissionMode::IncreasingOld:
+            case SubmissionMode::Increasing: {
+                // Increase the method value with each argument.
+                for (unsigned i = 0; i < header.arg_count; ++i) {
+                    WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr),
+                             header.arg_count - i - 1);
+                    current_addr += sizeof(u32);
+                }
+                break;
+            }
+            case SubmissionMode::NonIncreasingOld:
+            case SubmissionMode::NonIncreasing: {
+                // Use the same method value for all arguments.
+                for (unsigned i = 0; i < header.arg_count; ++i) {
+                    WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
+                             header.arg_count - i - 1);
+                    current_addr += sizeof(u32);
+                }
+                break;
+            }
+            case SubmissionMode::IncreaseOnce: {
+                ASSERT(header.arg_count.Value() >= 1);
+
+                // Use the original method for the first argument and then the next method for all
+                // other arguments.
+                WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
+                         header.arg_count - 1);
                 current_addr += sizeof(u32);
+
+                for (unsigned i = 1; i < header.arg_count; ++i) {
+                    WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr),
+                             header.arg_count - i - 1);
+                    current_addr += sizeof(u32);
+                }
+                break;
+            }
+            case SubmissionMode::Inline: {
+                // The register value is stored in the bits 16-28 as an immediate
+                WriteReg(header.method, header.subchannel, header.inline_data, 0);
+                break;
+            }
+            default:
+                UNIMPLEMENTED();
             }
-            break;
-        }
-        case SubmissionMode::Inline: {
-            // The register value is stored in the bits 16-28 as an immediate
-            WriteReg(header.method, header.subchannel, header.inline_data, 0);
-            break;
-        }
-        default:
-            UNIMPLEMENTED();
         }
     }
 }
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
index a01153e0b..bd766e77a 100644
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -7,6 +7,7 @@
 #include <type_traits>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/memory_manager.h"
 
 namespace Tegra {
 
@@ -19,6 +20,22 @@ enum class SubmissionMode : u32 {
     IncreaseOnce = 5
 };
 
+struct CommandListHeader { // One 8-byte entry: where a command list lives and how long it is
+    u32 entry0; // gpu_va_lo: low 32 bits of the list's GPU virtual address
+    union {
+        u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F)
+        BitField<0, 8, u32> gpu_va_hi; // Address bits above the low 32 (shifted by 32 below)
+        BitField<8, 2, u32> unk1;      // Unknown
+        BitField<10, 21, u32> sz;      // List length, counted in CommandHeader-sized words
+        BitField<31, 1, u32> unk2;     // Unknown
+    };
+
+    GPUVAddr Address() const { // Joins gpu_va_hi and entry0 into the full address
+        return (static_cast<GPUVAddr>(gpu_va_hi) << 32) | entry0;
+    }
+};
+static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size");
+
 union CommandHeader {
     u32 hex;
 
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index dcf9ef8b9..021b83eaa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -26,7 +26,7 @@ public:
     void WriteReg(u32 method, u32 value);
 
     struct Regs {
-        static constexpr size_t NUM_REGS = 0x258;
+        static constexpr std::size_t NUM_REGS = 0x258;
 
         struct Surface {
             RenderTargetFormat format;
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
new file mode 100644
index 000000000..66ae6332d
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -0,0 +1,45 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/logging/log.h"
+#include "core/memory.h"
+#include "video_core/engines/kepler_memory.h"
+
+namespace Tegra::Engines {
+
+KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+KeplerMemory::~KeplerMemory() = default;
+
+// Stores value in the register selected by method, then reacts to the two registers that
+// carry a side effect: exec and data.
+void KeplerMemory::WriteReg(u32 method, u32 value) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerMemory register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = value;
+
+    if (method == KEPLERMEMORY_REG_INDEX(exec)) {
+        // Writing exec rewinds the word offset that ProcessData adds to the destination.
+        state.write_offset = 0;
+        return;
+    }
+    if (method == KEPLERMEMORY_REG_INDEX(data)) {
+        ProcessData(value);
+    }
+}
+
+// Writes one word to memory at the destination address plus the running word offset.
+void KeplerMemory::ProcessData(u32 data) {
+    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
+    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
+
+    // write_offset counts words, so scale it to bytes before adding it to the base address.
+    const GPUVAddr target = regs.dest.Address() + state.write_offset * sizeof(u32);
+    const VAddr cpu_target = *memory_manager.GpuToCpuAddress(target);
+    Memory::Write32(cpu_target, data);
+
+    ++state.write_offset;
+}
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
new file mode 100644
index 000000000..b0d0078cf
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.h
@@ -0,0 +1,90 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Engines {
+
+#define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
+    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
+
+/// Engine that writes each word sent to its `data` register to memory at the `dest` address.
+class KeplerMemory final {
+public:
+    KeplerMemory(MemoryManager& memory_manager);
+    ~KeplerMemory();
+
+    /// Write the value to the register identified by method.
+    void WriteReg(u32 method, u32 value);
+
+    struct Regs {
+        static constexpr std::size_t NUM_REGS = 0x7F; // Number of u32 slots in reg_array
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0x60);
+
+                u32 line_length_in;
+                u32 line_count;
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 pitch;
+                    u32 block_dimensions;
+                    u32 width;
+                    u32 height;
+                    u32 depth;
+                    u32 z;
+                    u32 x;
+                    u32 y;
+
+                    GPUVAddr Address() const { // address_high and address_low combined
+                        return (static_cast<GPUVAddr>(address_high) << 32) | address_low;
+                    }
+                } dest;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear; // Must be set; ProcessData asserts on it
+                    };
+                } exec;
+
+                u32 data;
+
+                INSERT_PADDING_WORDS(0x11);
+            };
+            std::array<u32, NUM_REGS> reg_array; // Flat view of the same storage, by index
+        };
+    } regs{};
+
+    struct {
+        u32 write_offset = 0; // Words written so far; scaled by sizeof(u32) in ProcessData
+    } state{};
+
+private:
+    MemoryManager& memory_manager;
+
+    void ProcessData(u32 data);
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(line_length_in, 0x60);
+ASSERT_REG_POSITION(line_count, 0x61);
+ASSERT_REG_POSITION(dest, 0x62);
+ASSERT_REG_POSITION(exec, 0x6C);
+ASSERT_REG_POSITION(data, 0x6D);
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 68ff1e86b..8afd26fe9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -5,6 +5,7 @@
 #include <cinttypes>
 #include "common/assert.h"
 #include "core/core.h"
+#include "core/core_timing.h"
 #include "core/memory.h"
 #include "video_core/debug_utils/debug_utils.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -134,8 +135,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
         break;
     }
 
-    rasterizer.NotifyMaxwellRegisterChanged(method);
-
     if (debug_context) {
         debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr);
     }
@@ -194,8 +193,8 @@ void Maxwell3D::ProcessQueryGet() {
             // wait queues.
             LongQueryResult query_result{};
             query_result.value = result;
-            // TODO(Subv): Generate a real GPU timestamp and write it here instead of 0
-            query_result.timestamp = 0;
+            // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming
+            query_result.timestamp = CoreTiming::GetTicks();
             Memory::WriteBlock(*address, &query_result, sizeof(query_result));
         }
         break;
@@ -249,8 +248,8 @@ void Maxwell3D::DrawArrays() {
 
 void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
-    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
-    auto& bind_data = regs.cb_bind[static_cast<size_t>(stage)];
+    auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
+    auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)];
 
     auto& buffer = shader.const_buffers[bind_data.index];
 
@@ -292,10 +291,6 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
                    tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
                "TIC versions other than BlockLinear or Pitch are unimplemented");
 
-    ASSERT_MSG((tic_entry.texture_type == Texture::TextureType::Texture2D) ||
-                   (tic_entry.texture_type == Texture::TextureType::Texture2DNoMipmap),
-               "Texture types other than Texture2D are unimplemented");
-
     auto r_type = tic_entry.r_type.Value();
     auto g_type = tic_entry.g_type.Value();
     auto b_type = tic_entry.b_type.Value();
@@ -321,14 +316,14 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
 std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderStage stage) const {
     std::vector<Texture::FullTextureInfo> textures;
 
-    auto& fragment_shader = state.shader_stages[static_cast<size_t>(stage)];
+    auto& fragment_shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& tex_info_buffer = fragment_shader.const_buffers[regs.tex_cb_index];
     ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
 
     GPUVAddr tex_info_buffer_end = tex_info_buffer.address + tex_info_buffer.size;
 
     // Offset into the texture constbuffer where the texture info begins.
-    static constexpr size_t TextureInfoOffset = 0x20;
+    static constexpr std::size_t TextureInfoOffset = 0x20;
 
     for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset;
          current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) {
@@ -365,8 +360,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
     return textures;
 }
 
-Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, size_t offset) const {
-    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
+Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
+                                                    std::size_t offset) const {
+    auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
     ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 92bfda053..b81b0723d 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -34,17 +34,17 @@ public:
     /// Register structure of the Maxwell3D engine.
     /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
     struct Regs {
-        static constexpr size_t NUM_REGS = 0xE00;
-
-        static constexpr size_t NumRenderTargets = 8;
-        static constexpr size_t NumViewports = 16;
-        static constexpr size_t NumCBData = 16;
-        static constexpr size_t NumVertexArrays = 32;
-        static constexpr size_t NumVertexAttributes = 32;
-        static constexpr size_t MaxShaderProgram = 6;
-        static constexpr size_t MaxShaderStage = 5;
+        static constexpr std::size_t NUM_REGS = 0xE00;
+
+        static constexpr std::size_t NumRenderTargets = 8;
+        static constexpr std::size_t NumViewports = 16;
+        static constexpr std::size_t NumCBData = 16;
+        static constexpr std::size_t NumVertexArrays = 32;
+        static constexpr std::size_t NumVertexAttributes = 32;
+        static constexpr std::size_t MaxShaderProgram = 6;
+        static constexpr std::size_t MaxShaderStage = 5;
         // Maximum number of const buffers per shader stage.
-        static constexpr size_t MaxConstBuffers = 18;
+        static constexpr std::size_t MaxConstBuffers = 18;
 
         enum class QueryMode : u32 {
             Write = 0,
@@ -127,6 +127,7 @@ public:
                 BitField<21, 6, Size> size;
                 BitField<27, 3, Type> type;
                 BitField<31, 1, u32> bgra;
+                u32 hex;
             };
 
             u32 ComponentCount() const {
@@ -262,6 +263,10 @@ public:
             bool IsValid() const {
                 return size != Size::Invalid;
             }
+
+            bool operator<(const VertexAttribute& other) const {
+                return hex < other.hex;
+            }
         };
 
         enum class PrimitiveTopology : u32 {
@@ -438,9 +443,9 @@ public:
             }
         };
 
-        bool IsShaderConfigEnabled(size_t index) const {
+        bool IsShaderConfigEnabled(std::size_t index) const {
             // The VertexB is always enabled.
-            if (index == static_cast<size_t>(Regs::ShaderProgram::VertexB)) {
+            if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) {
                 return true;
             }
             return shader_config[index].enable != 0;
@@ -528,7 +533,11 @@ public:
                 u32 stencil_back_mask;
                 u32 stencil_back_func_mask;
 
-                INSERT_PADDING_WORDS(0x20);
+                INSERT_PADDING_WORDS(0x13);
+
+                u32 rt_separate_frag_data;
+
+                INSERT_PADDING_WORDS(0xC);
 
                 struct {
                     u32 address_high;
@@ -545,14 +554,29 @@ public:
 
                 INSERT_PADDING_WORDS(0x5B);
 
-                VertexAttribute vertex_attrib_format[NumVertexAttributes];
+                std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;
 
                 INSERT_PADDING_WORDS(0xF);
 
                 struct {
                     union {
                         BitField<0, 4, u32> count;
+                        BitField<4, 3, u32> map_0;
+                        BitField<7, 3, u32> map_1;
+                        BitField<10, 3, u32> map_2;
+                        BitField<13, 3, u32> map_3;
+                        BitField<16, 3, u32> map_4;
+                        BitField<19, 3, u32> map_5;
+                        BitField<22, 3, u32> map_6;
+                        BitField<25, 3, u32> map_7;
                     };
+
+                    u32 GetMap(std::size_t index) const { // Value of the map_<index> field
+                        const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3,
+                                                                     map_4, map_5, map_6, map_7};
+                        ASSERT(index < maps.size()); // index must be below NumRenderTargets
+                        return maps[index];
+                    }
                 } rt_control;
 
                 INSERT_PADDING_WORDS(0x2);
@@ -901,7 +925,7 @@ public:
     std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
 
     /// Returns the texture information for a specific texture in a specific shader stage.
-    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, size_t offset) const;
+    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const;
 
 private:
     VideoCore::RasterizerInterface& rasterizer;
@@ -963,8 +987,9 @@ ASSERT_REG_POSITION(clear_stencil, 0x368);
 ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
 ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
 ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
+ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
 ASSERT_REG_POSITION(zeta, 0x3F8);
-ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458);
+ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 6e740713f..aa7481b8c 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -41,7 +41,6 @@ void MaxwellDMA::HandleCopy() {
 
     // TODO(Subv): Perform more research and implement all features of this engine.
     ASSERT(regs.exec.enable_swizzle == 0);
-    ASSERT(regs.exec.enable_2d == 1);
     ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
     ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
     ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
@@ -51,10 +50,19 @@ void MaxwellDMA::HandleCopy() {
     ASSERT(regs.dst_params.pos_y == 0);
 
     if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
-        Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count * regs.y_count);
+        std::size_t copy_size = regs.x_count;
+
+        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
+        // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
+        if (regs.exec.enable_2d) {
+            copy_size = copy_size * regs.y_count;
+        }
+
+        Memory::CopyBlock(dest_cpu, source_cpu, copy_size);
         return;
     }
 
+    ASSERT(regs.exec.enable_2d == 1);
     u8* src_buffer = Memory::GetPointer(source_cpu);
     u8* dst_buffer = Memory::GetPointer(dest_cpu);
 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 7882f16e0..311ccb616 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -23,7 +23,7 @@ public:
     void WriteReg(u32 method, u32 value);
 
     struct Regs {
-        static constexpr size_t NUM_REGS = 0x1D6;
+        static constexpr std::size_t NUM_REGS = 0x1D6;
 
         struct Parameters {
             union {
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 3e4efbe0c..7e1de0fa1 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -20,10 +20,10 @@ namespace Tegra::Shader {
 
 struct Register {
     /// Number of registers
-    static constexpr size_t NumRegisters = 256;
+    static constexpr std::size_t NumRegisters = 256;
 
     /// Register 255 is special cased to always be 0
-    static constexpr size_t ZeroIndex = 255;
+    static constexpr std::size_t ZeroIndex = 255;
 
     enum class Size : u64 {
         Byte = 0,
@@ -67,6 +67,13 @@ private:
     u64 value{};
 };
 
+enum class AttributeSize : u64 {
+    Word = 0,
+    DoubleWord = 1,
+    TripleWord = 2,
+    QuadWord = 3,
+};
+
 union Attribute {
     Attribute() = default;
 
@@ -76,6 +83,7 @@ union Attribute {
         Position = 7,
         Attribute_0 = 8,
         Attribute_31 = 39,
+        PointCoord = 46,
         // This attribute contains a tuple of (~, ~, InstanceId, VertexId) when inside a vertex
         // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval
         // shader.
@@ -86,9 +94,10 @@ union Attribute {
     };
 
     union {
+        BitField<20, 10, u64> immediate;
         BitField<22, 2, u64> element;
         BitField<24, 6, Index> index;
-        BitField<47, 3, u64> size;
+        BitField<47, 3, AttributeSize> size;
     } fmt20;
 
     union {
@@ -231,6 +240,41 @@ enum class FlowCondition : u64 {
     Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for?
 };
 
+enum class ControlCode : u64 { // Decoded from a 5-bit field (see Instruction::csetp.cc)
+    F = 0,
+    LT = 1,
+    EQ = 2,
+    LE = 3,
+    GT = 4,
+    NE = 5,
+    GE = 6,
+    Num = 7,
+    Nan = 8,
+    LTU = 9,
+    EQU = 10,
+    LEU = 11,
+    GTU = 12,
+    NEU = 13,
+    GEU = 14,
+    // 15 has no enumerator here; the remaining values are 16-31.
+    OFF = 16,
+    LO = 17,
+    SFF = 18,
+    LS = 19,
+    HI = 20,
+    SFT = 21,
+    HS = 22,
+    OFT = 23,
+    CSM_TA = 24,
+    CSM_TR = 25,
+    CSM_MX = 26,
+    FCSM_TA = 27,
+    FCSM_TR = 28,
+    FCSM_MX = 29,
+    RLE = 30,
+    RGT = 31,
+};
+
 enum class PredicateResultMode : u64 {
     None = 0x0,
     NotZero = 0x3,
@@ -243,7 +287,47 @@ enum class TextureType : u64 {
     TextureCube = 3,
 };
 
-enum class IpaMode : u64 { Pass = 0, None = 1, Constant = 2, Sc = 3 };
+enum class TextureQueryType : u64 {
+    Dimension = 1,
+    TextureType = 2,
+    SamplePosition = 5,
+    Filter = 16,
+    LevelOfDetail = 18,
+    Wrap = 20,
+    BorderColor = 22,
+};
+
+enum class TextureProcessMode : u64 {
+    None = 0,
+    LZ = 1,  // Unknown, appears to be the same as none.
+    LB = 2,  // Load Bias.
+    LL = 3,  // Load LOD (LevelOfDetail)
+    LBA = 6, // Load Bias. The A is unknown, does not appear to differ with LB
+    LLA = 7  // Load LOD. The A is unknown, does not appear to differ with LL
+};
+
+enum class TextureMiscMode : u64 {
+    DC,
+    AOFFI, // Uses Offset
+    NDV,
+    NODEP,
+    MZ,
+    PTP,
+};
+
+enum class IpaInterpMode : u64 { Linear = 0, Perspective = 1, Flat = 2, Sc = 3 };
+enum class IpaSampleMode : u64 { Default = 0, Centroid = 1, Offset = 2 };
+
+struct IpaMode { // Interpolation mode plus sampling mode, compared as a pair
+    IpaInterpMode interpolation_mode; // Same enum as the Instruction::ipa interp_mode field
+    IpaSampleMode sampling_mode;      // Same enum as the Instruction::ipa sample_mode field
+    bool operator==(const IpaMode& a) const { // const so const operands can be compared
+        return (a.interpolation_mode == interpolation_mode) && (a.sampling_mode == sampling_mode);
+    }
+    bool operator!=(const IpaMode& a) const {
+        return !((*this) == a);
+    }
+};
 
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
@@ -328,10 +412,16 @@ union Instruction {
     } alu;
 
     union {
-        BitField<54, 3, IpaMode> mode;
+        BitField<51, 1, u64> saturate;
+        BitField<52, 2, IpaSampleMode> sample_mode;
+        BitField<54, 2, IpaInterpMode> interp_mode;
     } ipa;
 
     union {
+        BitField<39, 2, u64> tab5cb8_2;
+        BitField<41, 3, u64> tab5c68_1;
+        BitField<44, 2, u64> tab5c68_0;
+        BitField<47, 1, u64> cc;
         BitField<48, 1, u64> negate_b;
     } fmul;
 
@@ -395,12 +485,54 @@ union Instruction {
     } bfe;
 
     union {
+        BitField<48, 3, u64> pred48;
+
+        union {
+            BitField<20, 20, u64> entry_a;
+            BitField<39, 5, u64> entry_b;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } imm;
+
+        union {
+            BitField<20, 14, u64> cb_index;
+            BitField<34, 5, u64> cb_offset;
+            BitField<56, 1, u64> neg;
+            BitField<57, 1, u64> uses_cc;
+        } hi;
+
+        union {
+            BitField<20, 14, u64> cb_index;
+            BitField<34, 5, u64> cb_offset;
+            BitField<39, 5, u64> entry_a;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } rz;
+
+        union {
+            BitField<39, 5, u64> entry_a;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } r1;
+
+        union {
+            BitField<28, 8, u64> entry_a;
+            BitField<37, 1, u64> neg;
+            BitField<38, 1, u64> uses_cc;
+        } r2;
+
+    } lea;
+
+    union {
         BitField<0, 5, FlowCondition> cond;
     } flow;
 
     union {
+        BitField<47, 1, u64> cc;
         BitField<48, 1, u64> negate_b;
         BitField<49, 1, u64> negate_c;
+        BitField<51, 2, u64> tab5980_1;
+        BitField<53, 2, u64> tab5980_0;
     } ffma;
 
     union {
@@ -446,6 +578,27 @@ union Instruction {
     } psetp;
 
     union {
+        BitField<12, 3, u64> pred12;
+        BitField<15, 1, u64> neg_pred12;
+        BitField<24, 2, PredOperation> cond;
+        BitField<29, 3, u64> pred29;
+        BitField<32, 1, u64> neg_pred29;
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred39;
+        BitField<44, 1, u64> bf;
+        BitField<45, 2, PredOperation> op;
+    } pset;
+
+    union {
+        BitField<0, 3, u64> pred0;
+        BitField<3, 3, u64> pred3;
+        BitField<8, 5, ControlCode> cc; // flag in cc
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred39;
+        BitField<45, 4, PredOperation> op; // op with pred39
+    } csetp;
+
+    union {
         BitField<39, 3, u64> pred39;
         BitField<42, 1, u64> neg_pred;
         BitField<43, 1, u64> neg_a;
@@ -490,25 +643,127 @@ union Instruction {
         BitField<28, 1, u64> array;
         BitField<29, 2, TextureType> texture_type;
         BitField<31, 4, u64> component_mask;
+        BitField<49, 1, u64> nodep_flag;
+        BitField<50, 1, u64> dc_flag;
+        BitField<54, 1, u64> aoffi_flag;
+        BitField<55, 3, TextureProcessMode> process_mode;
 
-        bool IsComponentEnabled(size_t component) const {
+        bool IsComponentEnabled(std::size_t component) const {
             return ((1ull << component) & component_mask) != 0;
         }
+
+        TextureProcessMode GetTextureProcessMode() const { // Returns the process_mode field.
+            return process_mode;
+        }
+
+        bool UsesMiscMode(TextureMiscMode mode) const {
+            // DC, NODEP and AOFFI each map to their own flag bit; other modes report false.
+            if (mode == TextureMiscMode::DC) {
+                return dc_flag != 0;
+            }
+            if (mode == TextureMiscMode::NODEP) {
+                return nodep_flag != 0;
+            }
+            if (mode == TextureMiscMode::AOFFI) {
+                return aoffi_flag != 0;
+            }
+            return false;
+        }
     } tex;
 
     union {
+        BitField<22, 6, TextureQueryType> query_type; // Query selector, bits 22-27.
+        BitField<31, 4, u64> component_mask;          // 4-bit mask, bits 31-34.
+        BitField<49, 1, u64> nodep_flag;              // Backs TextureMiscMode::NODEP.
+
+        bool UsesMiscMode(TextureMiscMode mode) const { // Only NODEP is recognised.
+            switch (mode) {
+            case TextureMiscMode::NODEP:
+                return nodep_flag != 0;
+            default:
+                break;
+            }
+            return false;
+        }
+    } txq;
+
+    union {
         BitField<28, 1, u64> array;
         BitField<29, 2, TextureType> texture_type;
+        BitField<31, 4, u64> component_mask; // One bit per component, see IsComponentEnabled.
+        BitField<35, 1, u64> ndv_flag;       // Backs TextureMiscMode::NDV.
+        BitField<49, 1, u64> nodep_flag;     // Backs TextureMiscMode::NODEP.
+
+        bool IsComponentEnabled(std::size_t component) const { // Tests one bit of the mask.
+            return ((1ull << component) & component_mask) != 0;
+        }
+
+        bool UsesMiscMode(TextureMiscMode mode) const { // Only NDV and NODEP are recognised.
+            switch (mode) {
+            case TextureMiscMode::NDV:
+                return (ndv_flag != 0);
+            case TextureMiscMode::NODEP:
+                return (nodep_flag != 0);
+            default:
+                break;
+            }
+            return false;
+        }
+    } tmml;
+
+    union {
+        BitField<28, 1, u64> array;
+        BitField<29, 2, TextureType> texture_type;
+        BitField<35, 1, u64> ndv_flag;   // Backs TextureMiscMode::NDV.
+        BitField<49, 1, u64> nodep_flag; // Backs TextureMiscMode::NODEP.
+        BitField<50, 1, u64> dc_flag;    // Backs TextureMiscMode::DC.
+        BitField<54, 2, u64> info;       // 1 selects AOFFI, 2 selects PTP (see UsesMiscMode).
         BitField<56, 2, u64> component;
+
+        bool UsesMiscMode(TextureMiscMode mode) const { // False for modes not listed below.
+            switch (mode) {
+            case TextureMiscMode::NDV:
+                return ndv_flag != 0;
+            case TextureMiscMode::NODEP:
+                return nodep_flag != 0;
+            case TextureMiscMode::DC:
+                return dc_flag != 0;
+            case TextureMiscMode::AOFFI:
+                return info == 1; // AOFFI and PTP share the 2-bit info field.
+            case TextureMiscMode::PTP:
+                return info == 2;
+            default:
+                break;
+            }
+            return false;
+        }
     } tld4;
 
     union {
+        BitField<49, 1, u64> nodep_flag; // Backs TextureMiscMode::NODEP.
+        BitField<50, 1, u64> dc_flag;    // Backs TextureMiscMode::DC.
+        BitField<51, 1, u64> aoffi_flag; // Backs TextureMiscMode::AOFFI.
         BitField<52, 2, u64> component;
+
+        bool UsesMiscMode(TextureMiscMode mode) const { // False for modes not listed below.
+            switch (mode) {
+            case TextureMiscMode::DC:
+                return dc_flag != 0;
+            case TextureMiscMode::NODEP:
+                return nodep_flag != 0;
+            case TextureMiscMode::AOFFI:
+                return aoffi_flag != 0;
+            default:
+                break;
+            }
+            return false;
+        }
     } tld4s;
 
     union {
         BitField<0, 8, Register> gpr0;
         BitField<28, 8, Register> gpr28;
+        BitField<49, 1, u64> nodep_flag;
         BitField<50, 3, u64> component_mask_selector;
         BitField<53, 4, u64> texture_info;
 
@@ -528,6 +783,37 @@ union Instruction {
             UNREACHABLE();
         }
 
+        TextureProcessMode GetTextureProcessMode() const {
+            // Decode the process mode from the 4-bit texture_info selector:
+            //   0, 2, 6, 8, 9, 11 -> LZ
+            //   3, 5, 13          -> LL
+            //   anything else     -> None
+            const u64 info = texture_info;
+            const bool is_lz =
+                info == 0 || info == 2 || info == 6 || info == 8 || info == 9 || info == 11;
+            if (is_lz) {
+                return TextureProcessMode::LZ;
+            }
+            const bool is_ll = info == 3 || info == 5 || info == 13;
+            if (is_ll) {
+                return TextureProcessMode::LL;
+            }
+            // Every other selector value maps to None.
+            return TextureProcessMode::None;
+        }
+
+        bool UsesMiscMode(TextureMiscMode mode) const {
+            // DC is implied by texture_info values 4, 5, 6 and 9; NODEP has a dedicated flag.
+            if (mode == TextureMiscMode::DC) {
+                const u64 info = texture_info;
+                return info == 9 || (info >= 4 && info <= 6);
+            }
+            if (mode == TextureMiscMode::NODEP) {
+                return nodep_flag != 0;
+            }
+            return false;
+        }
+
         bool IsArrayTexture() const {
             // TEXS only supports Texture2D arrays.
             return texture_info >= 7 && texture_info <= 9;
@@ -537,7 +823,7 @@ union Instruction {
             return gpr28.Value() != Register::ZeroIndex;
         }
 
-        bool IsComponentEnabled(size_t component) const {
+        bool IsComponentEnabled(std::size_t component) const {
             static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
                 {},
                 {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
@@ -545,7 +831,7 @@ union Instruction {
                 {0x7, 0xb, 0xd, 0xe, 0xf},
             }};
 
-            size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
+            std::size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
             index |= gpr28.Value() != Register::ZeroIndex ? 2 : 0;
 
             u32 mask = mask_lut[index][component_mask_selector];
@@ -556,6 +842,7 @@ union Instruction {
     } texs;
 
     union {
+        BitField<49, 1, u64> nodep_flag;
         BitField<53, 4, u64> texture_info;
 
         TextureType GetTextureType() const {
@@ -576,6 +863,26 @@ union Instruction {
             UNREACHABLE();
         }
 
+        TextureProcessMode GetTextureProcessMode() const {
+            // texture_info values 1, 5 and 12 select LL; every other value selects LZ.
+            const bool is_ll = texture_info == 1 || texture_info == 5 || texture_info == 12;
+            return is_ll ? TextureProcessMode::LL : TextureProcessMode::LZ;
+        }
+
+        bool UsesMiscMode(TextureMiscMode mode) const {
+            // AOFFI and MZ are derived from texture_info; NODEP has a dedicated flag bit.
+            if (mode == TextureMiscMode::AOFFI) {
+                return texture_info == 4 || texture_info == 12;
+            }
+            if (mode == TextureMiscMode::MZ) {
+                return texture_info == 5;
+            }
+            if (mode == TextureMiscMode::NODEP) {
+                return nodep_flag != 0;
+            }
+            return false;
+        }
+
         bool IsArrayTexture() const {
             // TEXS only supports Texture2D arrays.
             return texture_info == 8;
@@ -618,6 +925,7 @@ union Instruction {
         BitField<36, 5, u64> index;
     } cbuf36;
 
+    BitField<47, 1, u64> generates_cc;
     BitField<61, 1, u64> is_b_imm;
     BitField<60, 1, u64> is_b_gpr;
     BitField<59, 1, u64> is_c_gpr;
@@ -647,11 +955,13 @@ public:
         LDG, // Load from global memory
         STG, // Store in global memory
         TEX,
-        TEXQ,  // Texture Query
-        TEXS,  // Texture Fetch with scalar/non-vec4 source/destinations
-        TLDS,  // Texture Load with scalar/non-vec4 source/destinations
-        TLD4,  // Texture Load 4
-        TLD4S, // Texture Load 4 with scalar / non - vec4 source / destinations
+        TXQ,    // Texture Query
+        TEXS,   // Texture Fetch with scalar/non-vec4 source/destinations
+        TLDS,   // Texture Load with scalar/non-vec4 source/destinations
+        TLD4,   // Texture Load 4
+        TLD4S,  // Texture Load 4 with scalar / non - vec4 source / destinations
+        TMML_B, // Texture Mip Map Level
+        TMML,   // Texture Mip Map Level
         EXIT,
         IPA,
         FFMA_IMM, // Fused Multiply and Add
@@ -676,6 +986,11 @@ public:
         ISCADD_C, // Scale and Add
         ISCADD_R,
         ISCADD_IMM,
+        LEA_R1,
+        LEA_R2,
+        LEA_RZ,
+        LEA_IMM,
+        LEA_HI,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -734,6 +1049,8 @@ public:
         ISET_C,
         ISET_IMM,
         PSETP,
+        PSET,
+        CSETP,
         XMAD_IMM,
         XMAD_CR,
         XMAD_RC,
@@ -757,6 +1074,7 @@ public:
         IntegerSet,
         IntegerSetPredicate,
         PredicateSetPredicate,
+        PredicateSetRegister,
         Conversion,
         Xmad,
         Unknown,
@@ -821,7 +1139,7 @@ public:
 private:
     struct Detail {
     private:
-        static constexpr size_t opcode_bitsize = 16;
+        static constexpr std::size_t opcode_bitsize = 16;
 
         /**
          * Generates the mask and the expected value after masking from a given bitstring.
@@ -830,8 +1148,8 @@ private:
          */
         static auto GetMaskAndExpect(const char* const bitstring) {
             u16 mask = 0, expect = 0;
-            for (size_t i = 0; i < opcode_bitsize; i++) {
-                const size_t bit_position = opcode_bitsize - i - 1;
+            for (std::size_t i = 0; i < opcode_bitsize; i++) {
+                const std::size_t bit_position = opcode_bitsize - i - 1;
                 switch (bitstring[i]) {
                 case '0':
                     mask |= 1 << bit_position;
@@ -871,11 +1189,13 @@ private:
             INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
             INST("1110111011011---", Id::STG, Type::Memory, "STG"),
             INST("110000----111---", Id::TEX, Type::Memory, "TEX"),
-            INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"),
+            INST("1101111101001---", Id::TXQ, Type::Memory, "TXQ"),
             INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"),
             INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"),
             INST("110010----111---", Id::TLD4, Type::Memory, "TLD4"),
             INST("1101111100------", Id::TLD4S, Type::Memory, "TLD4S"),
+            INST("110111110110----", Id::TMML_B, Type::Memory, "TMML_B"),
+            INST("1101111101011---", Id::TMML, Type::Memory, "TMML"),
             INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"),
             INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
             INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
@@ -906,6 +1226,11 @@ private:
             INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
             INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
             INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
+            INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"),
+            INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"),
+            INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"),
+            INST("010010111101----", Id::LEA_RZ, Type::ArithmeticInteger, "LEA_RZ"),
+            INST("00011000--------", Id::LEA_HI, Type::ArithmeticInteger, "LEA_HI"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
             INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
             INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),
@@ -960,7 +1285,9 @@ private:
             INST("010110110101----", Id::ISET_R, Type::IntegerSet, "ISET_R"),
             INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
             INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
+            INST("0101000010001---", Id::PSET, Type::PredicateSetRegister, "PSET"),
             INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
+            INST("010100001010----", Id::CSETP, Type::PredicateSetPredicate, "CSETP"),
             INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"),
             INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"),
             INST("010100010-------", Id::XMAD_RC, Type::Xmad, "XMAD_RC"),
diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h
new file mode 100644
index 000000000..a885ee3cf
--- /dev/null
+++ b/src/video_core/engines/shader_header.h
@@ -0,0 +1,103 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::Shader {
+
+enum class OutputTopology : u32 { // Type of Header::common3.output_topology.
+    PointList = 1,
+    LineStrip = 6,
+    TriangleStrip = 7,
+};
+
+// Documentation in:
+// http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html#ImapTexture
+struct Header { // 0x50 bytes: five common words followed by a stage-specific map.
+    union { // Common word 0
+        BitField<0, 5, u32> sph_type;
+        BitField<5, 5, u32> version;
+        BitField<10, 4, u32> shader_type;
+        BitField<14, 1, u32> mrt_enable;
+        BitField<15, 1, u32> kills_pixels;
+        BitField<16, 1, u32> does_global_store;
+        BitField<17, 4, u32> sass_version;
+        BitField<21, 5, u32> reserved;
+        BitField<26, 1, u32> does_load_or_store;
+        BitField<27, 1, u32> does_fp64;
+        BitField<28, 4, u32> stream_out_mask;
+    } common0;
+
+    union { // Common word 1
+        BitField<0, 24, u32> shader_local_memory_low_size;
+        BitField<24, 8, u32> per_patch_attribute_count;
+    } common1;
+
+    union { // Common word 2
+        BitField<0, 24, u32> shader_local_memory_high_size;
+        BitField<24, 8, u32> threads_per_input_primitive;
+    } common2;
+
+    union { // Common word 3
+        BitField<0, 24, u32> shader_local_memory_crs_size;
+        BitField<24, 4, OutputTopology> output_topology;
+        BitField<28, 4, u32> reserved;
+    } common3;
+
+    union { // Common word 4: the four fields tile the word as 12 + 8 + 4 + 8 bits.
+        BitField<0, 12, u32> max_output_vertices;
+        BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
+        BitField<20, 4, u32> reserved;
+        BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
+    } common4;
+
+    union { // Stage-specific tail; both layouts occupy 60 bytes.
+        struct {
+            INSERT_PADDING_BYTES(3);  // ImapSystemValuesA
+            INSERT_PADDING_BYTES(1);  // ImapSystemValuesB
+            INSERT_PADDING_BYTES(16); // ImapGenericVector[32]
+            INSERT_PADDING_BYTES(2);  // ImapColor
+            INSERT_PADDING_BYTES(2);  // ImapSystemValuesC
+            INSERT_PADDING_BYTES(5);  // ImapFixedFncTexture[10]
+            INSERT_PADDING_BYTES(1);  // ImapReserved
+            INSERT_PADDING_BYTES(3);  // OmapSystemValuesA
+            INSERT_PADDING_BYTES(1);  // OmapSystemValuesB
+            INSERT_PADDING_BYTES(16); // OmapGenericVector[32]
+            INSERT_PADDING_BYTES(2);  // OmapColor
+            INSERT_PADDING_BYTES(2);  // OmapSystemValuesC
+            INSERT_PADDING_BYTES(5);  // OmapFixedFncTexture[10]
+            INSERT_PADDING_BYTES(1);  // OmapReserved
+        } vtg;
+
+        struct {
+            INSERT_PADDING_BYTES(3);  // ImapSystemValuesA
+            INSERT_PADDING_BYTES(1);  // ImapSystemValuesB
+            INSERT_PADDING_BYTES(32); // ImapGenericVector[32]
+            INSERT_PADDING_BYTES(2);  // ImapColor
+            INSERT_PADDING_BYTES(2);  // ImapSystemValuesC
+            INSERT_PADDING_BYTES(10); // ImapFixedFncTexture[10]
+            INSERT_PADDING_BYTES(2);  // ImapReserved
+            struct {
+                u32 target; // Bit (render_target * 4 + component) enables that output.
+                union {
+                    BitField<0, 1, u32> sample_mask;
+                    BitField<1, 1, u32> depth;
+                    BitField<2, 30, u32> reserved;
+                };
+            } omap;
+            bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const {
+                const u32 bit = render_target * 4 + component; // Caller must keep this < 32.
+                return (omap.target & (1u << bit)) != 0; // Unsigned shift: bit 31 is valid.
+            }
+        } ps;
+    };
+};
+
+static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
+
+} // namespace Tegra::Shader
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index e6d8e65c6..baa8b63b7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -27,6 +28,7 @@ GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
     fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
     maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager);
 }
 
 GPU::~GPU() = default;
@@ -66,6 +68,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RGBA8_UINT:
     case RenderTargetFormat::RGB10_A2_UNORM:
     case RenderTargetFormat::BGRA8_UNORM:
+    case RenderTargetFormat::BGRA8_SRGB:
     case RenderTargetFormat::RG16_UNORM:
     case RenderTargetFormat::RG16_SNORM:
     case RenderTargetFormat::RG16_UINT:
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 2c3dbd97b..5cc1e19ca 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -4,8 +4,9 @@
 
 #pragma once
 
+#include <array>
 #include <memory>
-#include <unordered_map>
+#include <vector>
 #include "common/common_types.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/memory_manager.h"
@@ -26,6 +27,7 @@ enum class RenderTargetFormat : u32 {
     RG32_FLOAT = 0xCB,
     RG32_UINT = 0xCD,
     BGRA8_UNORM = 0xCF,
+    BGRA8_SRGB = 0xD0,
     RGB10_A2_UNORM = 0xD1,
     RGBA8_UNORM = 0xD5,
     RGBA8_SRGB = 0xD6,
@@ -40,6 +42,7 @@ enum class RenderTargetFormat : u32 {
     R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
     B5G6R5_UNORM = 0xE8,
+    BGR5A1_UNORM = 0xE9,
     RG8_UNORM = 0xEA,
     RG8_SNORM = 0xEB,
     R16_UNORM = 0xEE,
@@ -67,6 +70,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
 /// Returns the number of bytes per pixel of each depth format.
 u32 DepthFormatBytesPerPixel(DepthFormat format);
 
+struct CommandListHeader;
 class DebugContext;
 
 /**
@@ -99,6 +103,7 @@ class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
 class MaxwellDMA;
+class KeplerMemory;
 } // namespace Engines
 
 enum class EngineID {
@@ -115,7 +120,7 @@ public:
     ~GPU();
 
     /// Processes a command list stored at the specified address in GPU memory.
-    void ProcessCommandList(GPUVAddr address, u32 size);
+    void ProcessCommandLists(const std::vector<CommandListHeader>& commands);
 
     /// Returns a reference to the Maxwell3D GPU engine.
     Engines::Maxwell3D& Maxwell3D();
@@ -130,13 +135,10 @@ public:
     const Tegra::MemoryManager& MemoryManager() const;
 
 private:
-    /// Writes a single register in the engine bound to the specified subchannel
-    void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params);
-
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
 
     /// Mapping of command subchannels to their bound engine ids.
-    std::unordered_map<u32, EngineID> bound_engines;
+    std::array<EngineID, 8> bound_engines = {};
 
     /// 3D engine
     std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
@@ -146,6 +148,8 @@ private:
     std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
     /// DMA engine
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+    /// Inline memory engine
+    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h
index 7d836b816..cee0baaf3 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro_interpreter.h
@@ -152,7 +152,7 @@ private:
     boost::optional<u32>
         delayed_pc; ///< Program counter to execute at after the delay slot is executed.
 
-    static constexpr size_t NumMacroRegisters = 8;
+    static constexpr std::size_t NumMacroRegisters = 8;
 
     /// General purpose macro registers.
     std::array<u32, NumMacroRegisters> registers = {};
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 9d78e8b6b..cd819d69f 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -20,9 +20,6 @@ public:
     /// Clear the current framebuffer
     virtual void Clear() = 0;
 
-    /// Notify rasterizer that the specified Maxwell register has been changed
-    virtual void NotifyMaxwellRegisterChanged(u32 method) = 0;
-
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
 
diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp
index be17a2b9c..0df3725c2 100644
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -19,6 +19,7 @@ void RendererBase::RefreshBaseSettings() {
     UpdateCurrentFramebufferLayout();
 
     renderer_settings.use_framelimiter = Settings::values.use_frame_limit;
+    renderer_settings.set_background_color = true;
 }
 
 void RendererBase::UpdateCurrentFramebufferLayout() {
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index 2a357f9d0..2cd0738ff 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -19,6 +19,7 @@ namespace VideoCore {
 
 struct RendererSettings {
     std::atomic_bool use_framelimiter{false};
+    std::atomic_bool set_background_color{false};
 };
 
 class RendererBase : NonCopyable {
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
new file mode 100644
index 000000000..578aca789
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -0,0 +1,108 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <memory>
+#include <tuple>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/memory.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+
+namespace OpenGL {
+
+OGLBufferCache::OGLBufferCache(std::size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {}
+
+/// Copies `size` bytes of guest memory at `gpu_addr` into the mapped stream buffer and returns
+/// the offset of the copy within that buffer. When `cache` is set and the upload is at least
+/// 2048 bytes, a previously registered entry for the same CPU address is reused if it is large
+/// enough and has the same alignment.
+GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size,
+                                      std::size_t alignment, bool cache) {
+    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
+    const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
+    // GpuToCpuAddress returns an optional; fail loudly instead of dereferencing an empty one.
+    ASSERT_MSG(cpu_addr, "Invalid GPU address");
+
+    // Cache management is a big overhead, so only cache entries with a given size.
+    // TODO: Figure out which size is the best for given games.
+    cache &= size >= 2048;
+
+    if (cache) {
+        auto entry = TryGet(*cpu_addr);
+        if (entry) {
+            if (entry->size >= size && entry->alignment == alignment) {
+                return entry->offset;
+            }
+            Unregister(entry);
+        }
+    }
+
+    AlignBuffer(alignment);
+    const GLintptr uploaded_offset = buffer_offset;
+
+    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+
+    buffer_ptr += size;
+    buffer_offset += size;
+
+    if (cache) {
+        auto entry = std::make_shared<CachedBufferEntry>();
+        entry->offset = uploaded_offset;
+        entry->size = size;
+        entry->alignment = alignment;
+        entry->addr = *cpu_addr;
+        Register(entry);
+    }
+
+    return uploaded_offset;
+}
+
+/// Copies `size` bytes from host memory into the mapped stream buffer (never cached) and
+/// returns the offset of the copy within that buffer.
+GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                          std::size_t alignment) {
+    AlignBuffer(alignment);
+    std::memcpy(buffer_ptr, raw_pointer, size);
+    const GLintptr uploaded_offset = buffer_offset;
+
+    buffer_ptr += size;
+    buffer_offset += size;
+    return uploaded_offset;
+}
+
+/// Maps up to `max_size` bytes of the stream buffer for the uploads that follow. All cached
+/// entries are dropped when the stream buffer reports an invalidation.
+void OGLBufferCache::Map(std::size_t max_size) {
+    bool invalidate;
+    std::tie(buffer_ptr, buffer_offset_base, invalidate) =
+        stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
+    buffer_offset = buffer_offset_base;
+
+    if (invalidate) {
+        InvalidateAll();
+    }
+}
+
+/// Unmaps the stream buffer, passing it the number of bytes written since Map().
+void OGLBufferCache::Unmap() {
+    stream_buffer.Unmap(buffer_offset - buffer_offset_base);
+}
+
+GLuint OGLBufferCache::GetHandle() const {
+    return stream_buffer.GetHandle();
+}
+
+/// Moves the write cursor forward so buffer_offset is rounded up via Common::AlignUp.
+void OGLBufferCache::AlignBuffer(std::size_t alignment) {
+    // Align the offset, not the mapped pointer
+    const GLintptr offset_aligned =
+        static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
+    buffer_ptr += offset_aligned - buffer_offset;
+    buffer_offset = offset_aligned;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
new file mode 100644
index 000000000..6c18461f4
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -0,0 +1,57 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+
+#include "common/common_types.h"
+#include "video_core/rasterizer_cache.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_stream_buffer.h"
+
+namespace OpenGL {
+
+struct CachedBufferEntry final { // One cached upload inside the stream buffer.
+    VAddr GetAddr() const {
+        return addr;
+    }
+
+    std::size_t GetSizeInBytes() const {
+        return size;
+    }
+
+    VAddr addr{};            // CPU address the data was read from
+    std::size_t size{};      // Number of bytes uploaded
+    GLintptr offset{};       // Offset of the data within the stream buffer
+    std::size_t alignment{}; // Alignment the upload was made with
+};
+
+class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
+public:
+    explicit OGLBufferCache(std::size_t size); // `size` is forwarded to the stream buffer.
+
+    GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                          bool cache = true); // Uploads guest memory; returns its buffer offset.
+
+    GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
+
+    void Map(std::size_t max_size); // Maps the stream buffer and resets the write cursor.
+    void Unmap();                   // Unmaps, reporting the bytes written since Map().
+
+    GLuint GetHandle() const; // Handle of the underlying stream buffer.
+
+protected:
+    void AlignBuffer(std::size_t alignment); // Rounds the write cursor up via Common::AlignUp.
+
+private:
+    OGLStreamBuffer stream_buffer;
+
+    u8* buffer_ptr = nullptr;        // Write position in the mapped buffer
+    GLintptr buffer_offset = 0;      // Buffer offset matching buffer_ptr
+    GLintptr buffer_offset_base = 0; // Buffer offset returned by the last Map()
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7ce969f73..70fb54507 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <array>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -33,16 +34,19 @@ using PixelFormat = SurfaceParams::PixelFormat;
 using SurfaceType = SurfaceParams::SurfaceType;
 
 MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_VS, "OpenGL", "Vertex Shader Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_FS, "OpenGL", "Fragment Shader Setup", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
+MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info)
-    : emu_window{window}, screen_info{info}, stream_buffer(GL_ARRAY_BUFFER, STREAM_BUFFER_SIZE) {
+    : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) {
     // Create sampler objects
-    for (size_t i = 0; i < texture_samplers.size(); ++i) {
+    for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
         state.texture_units[i].sampler = texture_samplers[i].sampler.handle;
     }
@@ -55,6 +59,8 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
 
         if (extension == "GL_ARB_direct_state_access") {
             has_ARB_direct_state_access = true;
+        } else if (extension == "GL_ARB_multi_bind") {
+            has_ARB_multi_bind = true;
         } else if (extension == "GL_ARB_separate_shader_objects") {
             has_ARB_separate_shader_objects = true;
         } else if (extension == "GL_ARB_vertex_attrib_binding") {
@@ -67,28 +73,13 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
     // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
     state.clip_distance[0] = true;
 
-    // Generate VAO and UBO
-    sw_vao.Create();
-    uniform_buffer.Create();
-
-    state.draw.vertex_array = sw_vao.handle;
-    state.draw.uniform_buffer = uniform_buffer.handle;
-    state.Apply();
-
     // Create render framebuffer
     framebuffer.Create();
 
-    hw_vao.Create();
-
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
-
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
-    state.draw.vertex_array = hw_vao.handle;
     state.Apply();
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer.GetHandle());
-
     glEnable(GL_BLEND);
 
     glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
@@ -98,14 +89,60 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
 
 RasterizerOpenGL::~RasterizerOpenGL() {}
 
-std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
-                                                             GLintptr buffer_offset) {
+void RasterizerOpenGL::SetupVertexArrays() {
     MICROPROFILE_SCOPE(OpenGL_VAO);
     const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
-    state.draw.vertex_array = hw_vao.handle;
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
+    auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format);
+    auto& VAO = iter->second;
+
+    if (is_cache_miss) {
+        VAO.Create();
+        state.draw.vertex_array = VAO.handle;
+        state.Apply();
+
+        // The element array (index) buffer binding is part of VAO state, so it has to be set up
+        // again for every newly created VAO.
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer_cache.GetHandle());
+
+        // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
+        // Enables the first 16 vertex attributes always, as we don't know which ones are actually
+        // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16
+        // for now to avoid OpenGL errors.
+        // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
+        // assume every shader uses them all.
+        for (unsigned index = 0; index < 16; ++index) {
+            const auto& attrib = regs.vertex_attrib_format[index];
+
+            // Ignore invalid attributes.
+            if (!attrib.IsValid())
+                continue;
+
+            const auto& buffer = regs.vertex_array[attrib.buffer];
+            LOG_TRACE(HW_GPU,
+                      "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
+                      index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
+                      attrib.offset.Value(), attrib.IsNormalized());
+
+            ASSERT(buffer.IsEnabled());
+
+            glEnableVertexAttribArray(index);
+            if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt ||
+                attrib.type ==
+                    Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) {
+                glVertexAttribIFormat(index, attrib.ComponentCount(),
+                                      MaxwellToGL::VertexType(attrib), attrib.offset);
+            } else {
+                glVertexAttribFormat(index, attrib.ComponentCount(),
+                                     MaxwellToGL::VertexType(attrib),
+                                     attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
+            }
+            glVertexAttribBinding(index, attrib.buffer);
+        }
+    }
+    state.draw.vertex_array = VAO.handle;
+    state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
     // Upload all guest vertex arrays sequentially to our buffer
@@ -117,77 +154,35 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         Tegra::GPUVAddr start = vertex_array.StartAddress();
         const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
 
-        if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
-            start += vertex_array.stride * (gpu.state.current_instance / vertex_array.divisor);
-        }
-
         ASSERT(end > start);
-        u64 size = end - start + 1;
-
-        GLintptr vertex_buffer_offset;
-        std::tie(array_ptr, buffer_offset, vertex_buffer_offset) =
-            UploadMemory(array_ptr, buffer_offset, start, size);
+        const u64 size = end - start + 1;
+        const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size);
 
         // Bind the vertex array to the buffer at the current offset.
-        glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
+        glBindVertexBuffer(index, buffer_cache.GetHandle(), vertex_buffer_offset,
                            vertex_array.stride);
 
         if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
-            // Tell OpenGL that this is an instanced vertex buffer to prevent accessing different
-            // indexes on each vertex. We do the instance indexing manually by incrementing the
-            // start address of the vertex buffer.
-            glVertexBindingDivisor(index, 1);
+            // Enable vertex buffer instancing with the specified divisor.
+            glVertexBindingDivisor(index, vertex_array.divisor);
         } else {
             // Disable the vertex buffer instancing.
             glVertexBindingDivisor(index, 0);
         }
     }
-
-    // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
-    // Enables the first 16 vertex attributes always, as we don't know which ones are actually used
-    // until shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now
-    // to avoid OpenGL errors.
-    // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
-    // assume every shader uses them all.
-    for (unsigned index = 0; index < 16; ++index) {
-        auto& attrib = regs.vertex_attrib_format[index];
-
-        // Ignore invalid attributes.
-        if (!attrib.IsValid())
-            continue;
-
-        auto& buffer = regs.vertex_array[attrib.buffer];
-        LOG_TRACE(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
-                  index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
-                  attrib.offset.Value(), attrib.IsNormalized());
-
-        ASSERT(buffer.IsEnabled());
-
-        glEnableVertexAttribArray(index);
-        if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt ||
-            attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) {
-            glVertexAttribIFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
-                                  attrib.offset);
-        } else {
-            glVertexAttribFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
-                                 attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
-        }
-        glVertexAttribBinding(index, attrib.buffer);
-    }
-
-    return {array_ptr, buffer_offset};
 }
 
-std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+void RasterizerOpenGL::SetupShaders() {
+    MICROPROFILE_SCOPE(OpenGL_Shader);
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
 
     // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
     // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points.
     u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage;
     u32 current_texture_bindpoint = 0;
 
-    for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
-        auto& shader_config = gpu.regs.shader_config[index];
+    for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+        const auto& shader_config = gpu.regs.shader_config[index];
         const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
 
         // Skip stages that are not enabled
@@ -195,21 +190,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
             continue;
         }
 
-        std::tie(buffer_ptr, buffer_offset) =
-            AlignBuffer(buffer_ptr, buffer_offset, static_cast<size_t>(uniform_buffer_alignment));
-
-        const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
+        const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu.state.shader_stages[stage]);
-        std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
+        const GLintptr offset = buffer_cache.UploadHostMemory(
+            &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
 
         // Bind the buffer
-        glBindBufferRange(GL_UNIFORM_BUFFER, stage, stream_buffer.GetHandle(), buffer_offset,
-                          sizeof(ubo));
-
-        buffer_ptr += sizeof(ubo);
-        buffer_offset += sizeof(ubo);
+        glBindBufferRange(GL_UNIFORM_BUFFER, stage, buffer_cache.GetHandle(), offset, sizeof(ubo));
 
         Shader shader{shader_cache.GetStageProgram(program)};
 
@@ -230,9 +219,8 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
         }
 
         // Configure the const buffers for this shader stage.
-        std::tie(buffer_ptr, buffer_offset, current_constbuffer_bindpoint) =
-            SetupConstBuffers(buffer_ptr, buffer_offset, static_cast<Maxwell::ShaderStage>(stage),
-                              shader, current_constbuffer_bindpoint);
+        current_constbuffer_bindpoint = SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage),
+                                                          shader, current_constbuffer_bindpoint);
 
         // Configure the textures for this shader stage.
         current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader,
@@ -245,15 +233,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr
         }
     }
 
-    shader_program_manager->UseTrivialGeometryShader();
+    state.Apply();
 
-    return {buffer_ptr, buffer_offset};
+    shader_program_manager->UseTrivialGeometryShader();
 }
 
-size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
+std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
-    size_t size = 0;
+    std::size_t size = 0;
     for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
         if (!regs.vertex_array[index].IsEnabled())
             continue;
@@ -309,60 +297,80 @@ void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
         cached_pages.add({pages_interval, delta});
 }
 
-std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb,
-                                                                    bool using_depth_fb,
-                                                                    bool preserve_contents) {
+void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb,
+                                             bool preserve_contents,
+                                             boost::optional<std::size_t> single_color_target) {
+    MICROPROFILE_SCOPE(OpenGL_Framebuffer);
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
-    if (regs.rt[0].format == Tegra::RenderTargetFormat::NONE) {
-        LOG_ERROR(HW_GPU, "RenderTargetFormat is not configured");
-        using_color_fb = false;
+    Surface depth_surface;
+    if (using_depth_fb) {
+        depth_surface = res_cache.GetDepthBufferSurface(preserve_contents);
     }
 
-    const bool has_stencil = regs.stencil_enable;
-    const bool write_color_fb =
-        state.color_mask.red_enabled == GL_TRUE || state.color_mask.green_enabled == GL_TRUE ||
-        state.color_mask.blue_enabled == GL_TRUE || state.color_mask.alpha_enabled == GL_TRUE;
+    // TODO(bunnei): Figure out how the below register works. According to envytools, this should be
+    // used to enable multiple render targets. However, it is left unset on all games that I have
+    // tested.
+    ASSERT_MSG(regs.rt_separate_frag_data == 0, "Unimplemented");
 
-    const bool write_depth_fb =
-        (state.depth.test_enabled && state.depth.write_mask == GL_TRUE) ||
-        (has_stencil && (state.stencil.front.write_mask || state.stencil.back.write_mask));
-
-    Surface color_surface;
-    Surface depth_surface;
-    MathUtil::Rectangle<u32> surfaces_rect;
-    std::tie(color_surface, depth_surface, surfaces_rect) =
-        res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb, preserve_contents);
+    // Bind the framebuffer surfaces
+    state.draw.draw_framebuffer = framebuffer.handle;
+    state.Apply();
 
-    const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
-    const MathUtil::Rectangle<u32> draw_rect{
-        static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left,
-                                         surfaces_rect.left, surfaces_rect.right)), // Left
-        static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top,
-                                         surfaces_rect.bottom, surfaces_rect.top)), // Top
-        static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.right,
-                                         surfaces_rect.left, surfaces_rect.right)), // Right
-        static_cast<u32>(
-            std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.bottom,
-                            surfaces_rect.bottom, surfaces_rect.top))}; // Bottom
+    if (using_color_fb) {
+        if (single_color_target) {
+            // Used when just a single color attachment is enabled, e.g. for clearing a color buffer
+            Surface color_surface =
+                res_cache.GetColorBufferSurface(*single_color_target, preserve_contents);
+            glFramebufferTexture2D(
+                GL_DRAW_FRAMEBUFFER,
+                GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target), GL_TEXTURE_2D,
+                color_surface != nullptr ? color_surface->Texture().handle : 0, 0);
+            glDrawBuffer(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target));
+        } else {
+            // Multiple color attachments are enabled
+            std::array<GLenum, Maxwell::NumRenderTargets> buffers;
+            for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+                Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents);
+                buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index);
+                glFramebufferTexture2D(
+                    GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index),
+                    GL_TEXTURE_2D, color_surface != nullptr ? color_surface->Texture().handle : 0,
+                    0);
+            }
+            glDrawBuffers(regs.rt_control.count, buffers.data());
+        }
+    } else {
+        // No color attachments are enabled - zero out all of them
+        for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER,
+                                   GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), GL_TEXTURE_2D,
+                                   0, 0);
+        }
+        glDrawBuffer(GL_NONE);
+    }
 
-    // Bind the framebuffer surfaces
-    BindFramebufferSurfaces(color_surface, depth_surface, has_stencil);
+    if (depth_surface) {
+        if (regs.stencil_enable) {
+            // Attach both depth and stencil
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->Texture().handle, 0);
+        } else {
+            // Attach depth
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->Texture().handle, 0);
+            // Detach any stencil attachment
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+        }
+    } else {
+        // Detach both the depth and stencil attachments
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+    }
 
-    SyncViewport(surfaces_rect);
+    SyncViewport();
 
-    // Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable
-    // scissor test to prevent drawing outside of the framebuffer region
-    state.scissor.enabled = true;
-    state.scissor.x = draw_rect.left;
-    state.scissor.y = draw_rect.bottom;
-    state.scissor.width = draw_rect.GetWidth();
-    state.scissor.height = draw_rect.GetHeight();
     state.Apply();
-
-    // Only return the surface to be marked as dirty if writing to it is enabled.
-    return std::make_pair(write_color_fb ? color_surface : nullptr,
-                          write_depth_fb ? depth_surface : nullptr);
 }
 
 void RasterizerOpenGL::Clear() {
@@ -370,32 +378,24 @@ void RasterizerOpenGL::Clear() {
     SCOPE_EXIT({ prev_state.Apply(); });
 
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
-    bool use_color_fb = false;
-    bool use_depth_fb = false;
+    bool use_color{};
+    bool use_depth{};
+    bool use_stencil{};
 
     OpenGLState clear_state;
-    clear_state.draw.draw_framebuffer = state.draw.draw_framebuffer;
+    clear_state.draw.draw_framebuffer = framebuffer.handle;
     clear_state.color_mask.red_enabled = regs.clear_buffers.R ? GL_TRUE : GL_FALSE;
     clear_state.color_mask.green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE;
     clear_state.color_mask.blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE;
     clear_state.color_mask.alpha_enabled = regs.clear_buffers.A ? GL_TRUE : GL_FALSE;
 
-    GLbitfield clear_mask{};
     if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
         regs.clear_buffers.A) {
-        if (regs.clear_buffers.RT == 0) {
-            // We only support clearing the first color attachment for now
-            clear_mask |= GL_COLOR_BUFFER_BIT;
-            use_color_fb = true;
-        } else {
-            // TODO(subv): Add support for the other color attachments
-            LOG_CRITICAL(HW_GPU, "Clear unimplemented for RT {}", regs.clear_buffers.RT);
-        }
+        use_color = true;
     }
     if (regs.clear_buffers.Z) {
         ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!");
-        use_depth_fb = true;
-        clear_mask |= GL_DEPTH_BUFFER_BIT;
+        use_depth = true;
 
         // Always enable the depth write when clearing the depth buffer. The depth write mask is
         // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to true.
@@ -404,59 +404,33 @@ void RasterizerOpenGL::Clear() {
     }
     if (regs.clear_buffers.S) {
         ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
-        use_depth_fb = true;
-        clear_mask |= GL_STENCIL_BUFFER_BIT;
+        use_stencil = true;
         clear_state.stencil.test_enabled = true;
     }
 
-    if (!use_color_fb && !use_depth_fb) {
+    if (!use_color && !use_depth && !use_stencil) {
         // No color surface nor depth/stencil surface are enabled
         return;
     }
 
-    if (clear_mask == 0) {
-        // No clear mask is enabled
-        return;
-    }
-
     ScopeAcquireGLContext acquire_context{emu_window};
 
-    auto [dirty_color_surface, dirty_depth_surface] =
-        ConfigureFramebuffers(use_color_fb, use_depth_fb, false);
+    ConfigureFramebuffers(use_color, use_depth || use_stencil, false,
+                          regs.clear_buffers.RT.Value());
 
     clear_state.Apply();
 
-    glClearColor(regs.clear_color[0], regs.clear_color[1], regs.clear_color[2],
-                 regs.clear_color[3]);
-    glClearDepth(regs.clear_depth);
-    glClearStencil(regs.clear_stencil);
-
-    glClear(clear_mask);
-}
-
-std::pair<u8*, GLintptr> RasterizerOpenGL::AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset,
-                                                       size_t alignment) {
-    // Align the offset, not the mapped pointer
-    GLintptr offset_aligned =
-        static_cast<GLintptr>(Common::AlignUp(static_cast<size_t>(buffer_offset), alignment));
-    return {buffer_ptr + (offset_aligned - buffer_offset), offset_aligned};
-}
-
-std::tuple<u8*, GLintptr, GLintptr> RasterizerOpenGL::UploadMemory(u8* buffer_ptr,
-                                                                   GLintptr buffer_offset,
-                                                                   Tegra::GPUVAddr gpu_addr,
-                                                                   size_t size, size_t alignment) {
-    std::tie(buffer_ptr, buffer_offset) = AlignBuffer(buffer_ptr, buffer_offset, alignment);
-    GLintptr uploaded_offset = buffer_offset;
-
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-    const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
-    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
-
-    buffer_ptr += size;
-    buffer_offset += size;
+    if (use_color) {
+        glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
+    }
 
-    return {buffer_ptr, buffer_offset, uploaded_offset};
+    if (use_depth && use_stencil) {
+        glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
+    } else if (use_depth) {
+        glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
+    } else if (use_stencil) {
+        glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
+    }
 }
 
 void RasterizerOpenGL::DrawArrays() {
@@ -464,12 +438,12 @@ void RasterizerOpenGL::DrawArrays() {
         return;
 
     MICROPROFILE_SCOPE(OpenGL_Drawing);
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
 
     ScopeAcquireGLContext acquire_context{emu_window};
 
-    auto [dirty_color_surface, dirty_depth_surface] =
-        ConfigureFramebuffers(true, regs.zeta.Address() != 0 && regs.zeta_enable != 0, true);
+    ConfigureFramebuffers();
 
     SyncDepthTestState();
     SyncStencilTestState();
@@ -482,43 +456,46 @@ void RasterizerOpenGL::DrawArrays() {
 
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
-    const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
+    const u64 index_buffer_size{static_cast<u64>(regs.index_array.count) *
+                                static_cast<u64>(regs.index_array.FormatSizeInBytes())};
 
-    state.draw.vertex_buffer = stream_buffer.GetHandle();
+    state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
-    size_t buffer_size = CalculateVertexArraysSize();
+    std::size_t buffer_size = CalculateVertexArraysSize();
 
     if (is_indexed) {
-        buffer_size = Common::AlignUp<size_t>(buffer_size, 4) + index_buffer_size;
+        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + index_buffer_size;
     }
 
     // Uniform space for the 5 shader stages
     buffer_size =
-        Common::AlignUp<size_t>(buffer_size, 4) +
+        Common::AlignUp<std::size_t>(buffer_size, 4) +
         (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
     buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
 
-    u8* buffer_ptr;
-    GLintptr buffer_offset;
-    std::tie(buffer_ptr, buffer_offset, std::ignore) =
-        stream_buffer.Map(static_cast<GLsizeiptr>(buffer_size), 4);
-    u8* buffer_ptr_base = buffer_ptr;
+    buffer_cache.Map(buffer_size);
 
-    std::tie(buffer_ptr, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
+    SetupVertexArrays();
 
     // If indexed mode, copy the index buffer
     GLintptr index_buffer_offset = 0;
     if (is_indexed) {
-        std::tie(buffer_ptr, buffer_offset, index_buffer_offset) = UploadMemory(
-            buffer_ptr, buffer_offset, regs.index_array.StartAddress(), index_buffer_size);
+        MICROPROFILE_SCOPE(OpenGL_Index);
+
+        // Adjust the index buffer start address so it points to the first desired index.
+        auto index_start = regs.index_array.StartAddress();
+        index_start += static_cast<size_t>(regs.index_array.first) *
+                       static_cast<size_t>(regs.index_array.FormatSizeInBytes());
+
+        index_buffer_offset = buffer_cache.UploadMemory(index_start, index_buffer_size);
     }
 
-    std::tie(buffer_ptr, buffer_offset) = SetupShaders(buffer_ptr, buffer_offset);
+    SetupShaders();
 
-    stream_buffer.Unmap(buffer_ptr - buffer_ptr_base);
+    buffer_cache.Unmap();
 
     shader_program_manager->ApplyTo(state);
     state.Apply();
@@ -527,14 +504,26 @@ void RasterizerOpenGL::DrawArrays() {
     if (is_indexed) {
         const GLint base_vertex{static_cast<GLint>(regs.vb_element_base)};
 
-        // Adjust the index buffer offset so it points to the first desired index.
-        index_buffer_offset += regs.index_array.first * regs.index_array.FormatSizeInBytes();
-
-        glDrawElementsBaseVertex(primitive_mode, regs.index_array.count,
-                                 MaxwellToGL::IndexFormat(regs.index_array.format),
-                                 reinterpret_cast<const void*>(index_buffer_offset), base_vertex);
+        if (gpu.state.current_instance > 0) {
+            glDrawElementsInstancedBaseVertexBaseInstance(
+                primitive_mode, regs.index_array.count,
+                MaxwellToGL::IndexFormat(regs.index_array.format),
+                reinterpret_cast<const void*>(index_buffer_offset), 1, base_vertex,
+                gpu.state.current_instance);
+        } else {
+            glDrawElementsBaseVertex(primitive_mode, regs.index_array.count,
+                                     MaxwellToGL::IndexFormat(regs.index_array.format),
+                                     reinterpret_cast<const void*>(index_buffer_offset),
+                                     base_vertex);
+        }
     } else {
-        glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count);
+        if (gpu.state.current_instance > 0) {
+            glDrawArraysInstancedBaseInstance(primitive_mode, regs.vertex_buffer.first,
+                                              regs.vertex_buffer.count, 1,
+                                              gpu.state.current_instance);
+        } else {
+            glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count);
+        }
     }
 
     // Disable scissor test
@@ -549,24 +538,18 @@ void RasterizerOpenGL::DrawArrays() {
     state.Apply();
 }
 
-void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {}
+void RasterizerOpenGL::FlushAll() {}
 
-void RasterizerOpenGL::FlushAll() {
-    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-}
-
-void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
-    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-}
+void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {}
 
 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     res_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
+    buffer_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
-    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     InvalidateRegion(addr, size);
 }
 
@@ -614,7 +597,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 void RasterizerOpenGL::SamplerInfo::Create() {
     sampler.Create();
     mag_filter = min_filter = Tegra::Texture::TextureFilter::Linear;
-    wrap_u = wrap_v = Tegra::Texture::WrapMode::Wrap;
+    wrap_u = wrap_v = wrap_p = Tegra::Texture::WrapMode::Wrap;
 
     // default is GL_LINEAR_MIPMAP_LINEAR
     glSamplerParameteri(sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
@@ -622,7 +605,7 @@ void RasterizerOpenGL::SamplerInfo::Create() {
 }
 
 void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntry& config) {
-    GLuint s = sampler.handle;
+    const GLuint s = sampler.handle;
 
     if (mag_filter != config.mag_filter) {
         mag_filter = config.mag_filter;
@@ -641,8 +624,13 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
         wrap_v = config.wrap_v;
         glSamplerParameteri(s, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(wrap_v));
     }
+    if (wrap_p != config.wrap_p) {
+        wrap_p = config.wrap_p;
+        glSamplerParameteri(s, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(wrap_p));
+    }
 
-    if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border) {
+    if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border ||
+        wrap_p == Tegra::Texture::WrapMode::Border) {
         const GLvec4 new_border_color = {{config.border_color_r, config.border_color_g,
                                           config.border_color_b, config.border_color_a}};
         if (border_color != new_border_color) {
@@ -652,26 +640,35 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_ptr,
-                                                                   GLintptr buffer_offset,
-                                                                   Maxwell::ShaderStage stage,
-                                                                   Shader& shader,
-                                                                   u32 current_bindpoint) {
+u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader,
+                                        u32 current_bindpoint) {
+    MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
-    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)];
+    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& entries = shader->GetShaderEntries().const_buffer_entries;
 
+    constexpr u64 max_binds = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers;
+    std::array<GLuint, max_binds> bind_buffers;
+    std::array<GLintptr, max_binds> bind_offsets;
+    std::array<GLsizeiptr, max_binds> bind_sizes;
+
+    ASSERT_MSG(entries.size() <= max_binds, "Exceeded expected number of binding points.");
+
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& used_buffer = entries[bindpoint];
         const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
 
         if (!buffer.enabled) {
+            // For disabled buffers, set the values to zero to unbind them
+            bind_buffers[bindpoint] = 0;
+            bind_offsets[bindpoint] = 0;
+            bind_sizes[bindpoint] = 0;
             continue;
         }
 
-        size_t size = 0;
+        std::size_t size = 0;
 
         if (used_buffer.IsIndirect()) {
             // Buffer is accessed indirectly, so upload the entire thing
@@ -692,26 +689,28 @@ std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_pt
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        GLintptr const_buffer_offset;
-        std::tie(buffer_ptr, buffer_offset, const_buffer_offset) =
-            UploadMemory(buffer_ptr, buffer_offset, buffer.address, size,
-                         static_cast<size_t>(uniform_buffer_alignment));
-
-        glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint,
-                          stream_buffer.GetHandle(), const_buffer_offset, size);
+        GLintptr const_buffer_offset = buffer_cache.UploadMemory(
+            buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
 
         // Now configure the bindpoint of the buffer inside the shader
         glUniformBlockBinding(shader->GetProgramHandle(),
-                              shader->GetProgramResourceIndex(used_buffer.GetName()),
+                              shader->GetProgramResourceIndex(used_buffer),
                               current_bindpoint + bindpoint);
+
+        // Prepare values for multibind
+        bind_buffers[bindpoint] = buffer_cache.GetHandle();
+        bind_offsets[bindpoint] = const_buffer_offset;
+        bind_sizes[bindpoint] = size;
     }
 
-    state.Apply();
+    glBindBuffersRange(GL_UNIFORM_BUFFER, current_bindpoint, static_cast<GLsizei>(entries.size()),
+                       bind_buffers.data(), bind_offsets.data(), bind_sizes.data());
 
-    return {buffer_ptr, buffer_offset, current_bindpoint + static_cast<u32>(entries.size())};
+    return current_bindpoint + static_cast<u32>(entries.size());
 }
 
 u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, u32 current_unit) {
+    MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
     const auto& entries = shader->GetShaderEntries().texture_samplers;
@@ -721,24 +720,25 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader,
 
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& entry = entries[bindpoint];
-        u32 current_bindpoint = current_unit + bindpoint;
+        const u32 current_bindpoint = current_unit + bindpoint;
 
         // Bind the uniform to the sampler.
 
-        glProgramUniform1i(shader->GetProgramHandle(), shader->GetUniformLocation(entry.GetName()),
+        glProgramUniform1i(shader->GetProgramHandle(), shader->GetUniformLocation(entry),
                            current_bindpoint);
 
         const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset());
 
         if (!texture.enabled) {
-            state.texture_units[current_bindpoint].texture_2d = 0;
+            state.texture_units[current_bindpoint].texture = 0;
             continue;
         }
 
         texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc);
         Surface surface = res_cache.GetTextureSurface(texture);
         if (surface != nullptr) {
-            state.texture_units[current_bindpoint].texture_2d = surface->Texture().handle;
+            state.texture_units[current_bindpoint].texture = surface->Texture().handle;
+            state.texture_units[current_bindpoint].target = surface->Target();
             state.texture_units[current_bindpoint].swizzle.r =
                 MaxwellToGL::SwizzleSource(texture.tic.x_source);
             state.texture_units[current_bindpoint].swizzle.g =
@@ -749,47 +749,19 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader,
                 MaxwellToGL::SwizzleSource(texture.tic.w_source);
         } else {
             // Can occur when texture addr is null or its memory is unmapped/invalid
-            state.texture_units[current_bindpoint].texture_2d = 0;
+            state.texture_units[current_bindpoint].texture = 0;
         }
     }
 
-    state.Apply();
-
     return current_unit + static_cast<u32>(entries.size());
 }
 
-void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
-                                               const Surface& depth_surface, bool has_stencil) {
-    state.draw.draw_framebuffer = framebuffer.handle;
-    state.Apply();
-
-    glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
-                           color_surface != nullptr ? color_surface->Texture().handle : 0, 0);
-    if (depth_surface != nullptr) {
-        if (has_stencil) {
-            // attach both depth and stencil
-            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                                   depth_surface->Texture().handle, 0);
-        } else {
-            // attach depth
-            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
-                                   depth_surface->Texture().handle, 0);
-            // clear stencil attachment
-            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-        }
-    } else {
-        // clear both depth and stencil attachment
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-    }
-}
-
-void RasterizerOpenGL::SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect) {
+void RasterizerOpenGL::SyncViewport() { // Copies guest viewport_transform[0] into state.viewport
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
     const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
 
-    state.viewport.x = static_cast<GLint>(surfaces_rect.left) + viewport_rect.left;
-    state.viewport.y = static_cast<GLint>(surfaces_rect.bottom) + viewport_rect.bottom;
+    state.viewport.x = viewport_rect.left;
+    state.viewport.y = viewport_rect.bottom;
     state.viewport.width = static_cast<GLsizei>(viewport_rect.GetWidth());
     state.viewport.height = static_cast<GLsizei>(viewport_rect.GetHeight());
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 30045ebff..bf9560bdc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -6,19 +6,23 @@
 
 #include <array>
 #include <cstddef>
+#include <map>
 #include <memory>
 #include <tuple>
 #include <utility>
 #include <vector>
 
 #include <boost/icl/interval_map.hpp>
+#include <boost/optional.hpp>
 #include <boost/range/iterator_range.hpp>
 #include <glad/glad.h>
 
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -42,7 +46,6 @@ public:
 
     void DrawArrays() override;
     void Clear() override;
-    void NotifyMaxwellRegisterChanged(u32 method) override;
     void FlushAll() override;
     void FlushRegion(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size) override;
@@ -70,7 +73,7 @@ public:
     };
 
     /// Maximum supported size that a constbuffer can have in bytes.
-    static constexpr size_t MaxConstbufferSize = 0x10000;
+    static constexpr std::size_t MaxConstbufferSize = 0x10000;
     static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
                   "The maximum size of a constbuffer must be a multiple of the size of GLvec4");
 
@@ -90,17 +93,20 @@ private:
         Tegra::Texture::TextureFilter min_filter;
         Tegra::Texture::WrapMode wrap_u;
         Tegra::Texture::WrapMode wrap_v;
+        Tegra::Texture::WrapMode wrap_p;
         GLvec4 border_color;
     };
 
-    /// Configures the color and depth framebuffer states and returns the dirty <Color, Depth>
-    /// surfaces if writing was enabled.
-    std::pair<Surface, Surface> ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb,
-                                                      bool preserve_contents);
-
-    /// Binds the framebuffer color and depth surface
-    void BindFramebufferSurfaces(const Surface& color_surface, const Surface& depth_surface,
-                                 bool has_stencil);
+    /**
+     * Configures the color and depth framebuffer states.
+     * @param use_color_fb If true, configure color framebuffers.
+     * @param using_depth_fb If true, configure the depth/stencil framebuffer.
+     * @param preserve_contents If true, tries to preserve data from a previously used framebuffer.
+     * @param single_color_target Specifies if a single color buffer target should be used.
+     */
+    void ConfigureFramebuffers(bool use_color_fb = true, bool using_depth_fb = true,
+                               bool preserve_contents = true,
+                               boost::optional<std::size_t> single_color_target = {});
 
     /*
      * Configures the current constbuffers to use for the draw command.
@@ -109,9 +115,8 @@ private:
      * @param current_bindpoint The offset at which to start counting new buffer bindpoints.
      * @returns The next available bindpoint for use in the next shader stage.
      */
-    std::tuple<u8*, GLintptr, u32> SetupConstBuffers(
-        u8* buffer_ptr, GLintptr buffer_offset, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-        Shader& shader, u32 current_bindpoint);
+    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader,
+                          u32 current_bindpoint);
 
     /*
      * Configures the current textures to use for the draw command.
@@ -124,7 +129,7 @@ private:
                       u32 current_unit);
 
     /// Syncs the viewport to match the guest state
-    void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect);
+    void SyncViewport();
 
     /// Syncs the clip enabled status to match the guest state
     void SyncClipEnabled();
@@ -154,6 +159,7 @@ private:
     void SyncLogicOpState();
 
     bool has_ARB_direct_state_access = false;
+    bool has_ARB_multi_bind = false;
     bool has_ARB_separate_shader_objects = false;
     bool has_ARB_vertex_attrib_binding = false;
 
@@ -167,28 +173,23 @@ private:
     ScreenInfo& screen_info;
 
     std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
-    OGLVertexArray sw_vao;
-    OGLVertexArray hw_vao;
+    std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute,
+                        Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>,
+             OGLVertexArray>
+        vertex_array_cache;
 
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
 
-    static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
-    OGLStreamBuffer stream_buffer;
-    OGLBuffer uniform_buffer;
+    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
+    OGLBufferCache buffer_cache;
     OGLFramebuffer framebuffer;
     GLint uniform_buffer_alignment;
 
-    size_t CalculateVertexArraysSize() const;
-
-    std::pair<u8*, GLintptr> SetupVertexArrays(u8* array_ptr, GLintptr buffer_offset);
-
-    std::pair<u8*, GLintptr> SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
+    std::size_t CalculateVertexArraysSize() const;
 
-    std::pair<u8*, GLintptr> AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, size_t alignment);
+    void SetupVertexArrays();
 
-    std::tuple<u8*, GLintptr, GLintptr> UploadMemory(u8* buffer_ptr, GLintptr buffer_offset,
-                                                     Tegra::GPUVAddr gpu_addr, size_t size,
-                                                     size_t alignment = 4);
+    void SetupShaders();
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw = AccelDraw::Disabled;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 1965ab7d5..86682d7cb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -7,6 +7,7 @@
 
 #include "common/alignment.h"
 #include "common/assert.h"
+#include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
 #include "core/core.h"
@@ -52,14 +53,30 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format));
     params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
     params.unaligned_height = config.tic.Height();
+    params.target = SurfaceTargetFromTextureType(config.tic.texture_type);
+
+    switch (params.target) {
+    case SurfaceTarget::Texture1D:
+    case SurfaceTarget::Texture2D:
+        params.depth = 1;
+        break;
+    case SurfaceTarget::Texture3D:
+    case SurfaceTarget::Texture2DArray:
+        params.depth = config.tic.Depth();
+        break;
+    default:
+        LOG_CRITICAL(HW_GPU, "Unknown depth for target={}", static_cast<u32>(params.target));
+        UNREACHABLE();
+        params.depth = 1;
+        break;
+    }
+
     params.size_in_bytes = params.SizeInBytes();
-    params.cache_width = Common::AlignUp(params.width, 16);
-    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
-/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(
-    const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config) {
+/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(std::size_t index) {
+    const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]};
     SurfaceParams params{};
     params.addr = TryGetCpuAddr(config.Address());
     params.is_tiled = true;
@@ -70,9 +87,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.width = config.width;
     params.height = config.height;
     params.unaligned_height = config.height;
+    params.target = SurfaceTarget::Texture2D;
+    params.depth = 1;
     params.size_in_bytes = params.SizeInBytes();
-    params.cache_width = Common::AlignUp(params.width, 16);
-    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
@@ -86,13 +103,12 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.pixel_format = PixelFormatFromDepthFormat(format);
     params.component_type = ComponentTypeFromDepthFormat(format);
     params.type = GetFormatType(params.pixel_format);
-    params.size_in_bytes = params.SizeInBytes();
     params.width = zeta_width;
     params.height = zeta_height;
     params.unaligned_height = zeta_height;
+    params.target = SurfaceTarget::Texture2D;
+    params.depth = 1;
     params.size_in_bytes = params.SizeInBytes();
-    params.cache_width = Common::AlignUp(params.width, 16);
-    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
@@ -100,7 +116,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
     {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},   // ABGR8UI
-    {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
+    {GL_RGB8, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},   // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10U
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U
@@ -151,6 +167,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                                // RG8S
     {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // RG32UI
     {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // R32UI
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8
 
     // Depth formats
     {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F
@@ -166,8 +183,28 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
      ComponentType::Float, false}, // Z32FS8
 }};
 
+static GLenum SurfaceTargetToGL(SurfaceParams::SurfaceTarget target) {
+    switch (target) { // One case per handled target; no default, so a miss falls through below
+    case SurfaceParams::SurfaceTarget::Texture1D:
+        return GL_TEXTURE_1D;
+    case SurfaceParams::SurfaceTarget::Texture2D:
+        return GL_TEXTURE_2D;
+    case SurfaceParams::SurfaceTarget::Texture3D:
+        return GL_TEXTURE_3D;
+    case SurfaceParams::SurfaceTarget::Texture1DArray:
+        return GL_TEXTURE_1D_ARRAY;
+    case SurfaceParams::SurfaceTarget::Texture2DArray:
+        return GL_TEXTURE_2D_ARRAY;
+    case SurfaceParams::SurfaceTarget::TextureCubemap:
+        return GL_TEXTURE_CUBE_MAP;
+    }
+    LOG_CRITICAL(Render_OpenGL, "Unimplemented texture target={}", static_cast<u32>(target));
+    UNREACHABLE();
+    return {}; // No case matched: yields a value-initialized GLenum
+}
+
 static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
-    ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
+    ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
     auto& format = tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     ASSERT(component_type == format.component_type);
 
@@ -177,6 +214,7 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType
 static bool IsPixelFormatASTC(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8:
         return true;
     default:
         return false;
@@ -187,6 +225,8 @@ static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
         return {4, 4};
+    case PixelFormat::ASTC_2D_8X8:
+        return {8, 8};
     default:
         LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
         UNREACHABLE();
@@ -220,7 +260,8 @@ static bool IsFormatBCn(PixelFormat format) {
 }
 
 template <bool morton_to_gl, PixelFormat format>
-void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_buffer, VAddr addr) {
+void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, std::size_t gl_buffer_size,
+                VAddr addr) {
     constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
     constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
 
@@ -230,18 +271,18 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_bu
         const u32 tile_size{IsFormatBCn(format) ? 4U : 1U};
         const std::vector<u8> data = Tegra::Texture::UnswizzleTexture(
             addr, tile_size, bytes_per_pixel, stride, height, block_height);
-        const size_t size_to_copy{std::min(gl_buffer.size(), data.size())};
-        gl_buffer.assign(data.begin(), data.begin() + size_to_copy);
+        const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())};
+        memcpy(gl_buffer, data.data(), size_to_copy);
     } else {
         // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should
         // check the configuration for this and perform more generic un/swizzle
         LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
         VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel,
-                                       Memory::GetPointer(addr), gl_buffer.data(), morton_to_gl);
+                                       Memory::GetPointer(addr), gl_buffer, morton_to_gl);
     }
 }
 
-static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
         // clang-format off
@@ -290,6 +331,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
         MortonCopy<true, PixelFormat::RG8S>,
         MortonCopy<true, PixelFormat::RG32UI>,
         MortonCopy<true, PixelFormat::R32UI>,
+        MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
         MortonCopy<true, PixelFormat::Z32F>,
         MortonCopy<true, PixelFormat::Z16>,
         MortonCopy<true, PixelFormat::Z24S8>,
@@ -298,7 +340,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
         // clang-format on
 };
 
-static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr),
                             SurfaceParams::MaxPixelFormat>
     gl_to_morton_fns = {
         // clang-format off
@@ -349,6 +391,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
         MortonCopy<false, PixelFormat::RG8S>,
         MortonCopy<false, PixelFormat::RG32UI>,
         MortonCopy<false, PixelFormat::R32UI>,
+        nullptr,
         MortonCopy<false, PixelFormat::Z32F>,
         MortonCopy<false, PixelFormat::Z16>,
         MortonCopy<false, PixelFormat::Z24S8>,
@@ -357,33 +400,6 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr),
         // clang-format on
 };
 
-// Allocate an uninitialized texture of appropriate size and format for the surface
-static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tuple, u32 width,
-                                   u32 height) {
-    OpenGLState cur_state = OpenGLState::GetCurState();
-
-    // Keep track of previous texture bindings
-    GLuint old_tex = cur_state.texture_units[0].texture_2d;
-    cur_state.texture_units[0].texture_2d = texture;
-    cur_state.Apply();
-    glActiveTexture(GL_TEXTURE0);
-
-    if (!format_tuple.compressed) {
-        // Only pre-create the texture for non-compressed textures.
-        glTexImage2D(GL_TEXTURE_2D, 0, format_tuple.internal_format, width, height, 0,
-                     format_tuple.format, format_tuple.type, nullptr);
-    }
-
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-
-    // Restore previous texture bindings
-    cur_state.texture_units[0].texture_2d = old_tex;
-    cur_state.Apply();
-}
-
 static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex,
                          const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type,
                          GLuint read_fb_handle, GLuint draw_fb_handle) {
@@ -438,12 +454,53 @@ static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rec
     return true;
 }
 
-CachedSurface::CachedSurface(const SurfaceParams& params) : params(params) {
+CachedSurface::CachedSurface(const SurfaceParams& params)
+    : params(params), gl_target(SurfaceTargetToGL(params.target)) {
     texture.Create();
     const auto& rect{params.GetRect()};
-    AllocateSurfaceTexture(texture.handle,
-                           GetFormatTuple(params.pixel_format, params.component_type),
+
+    // Copy unit 0's state by value: a reference would alias the slot that is overwritten below
+    OpenGLState cur_state = OpenGLState::GetCurState();
+    const auto old_tex = cur_state.texture_units[0];
+    SCOPE_EXIT({
+        cur_state.texture_units[0] = old_tex;
+        cur_state.Apply();
+    });
+
+    cur_state.texture_units[0].texture = texture.handle;
+    cur_state.texture_units[0].target = SurfaceTargetToGL(params.target);
+    cur_state.Apply();
+    glActiveTexture(GL_TEXTURE0);
+
+    const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type);
+    if (!format_tuple.compressed) {
+        // Only pre-create the texture for non-compressed textures.
+        switch (params.target) {
+        case SurfaceParams::SurfaceTarget::Texture1D:
+            glTexStorage1D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
+                           rect.GetWidth());
+            break;
+        case SurfaceParams::SurfaceTarget::Texture2D:
+            glTexStorage2D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
                            rect.GetWidth(), rect.GetHeight());
+            break;
+        case SurfaceParams::SurfaceTarget::Texture3D:
+        case SurfaceParams::SurfaceTarget::Texture2DArray:
+            glTexStorage3D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
+                           rect.GetWidth(), rect.GetHeight(), params.depth);
+            break;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
+                         static_cast<u32>(params.target));
+            UNREACHABLE();
+            glTexStorage2D(GL_TEXTURE_2D, 1, format_tuple.internal_format, rect.GetWidth(),
+                           rect.GetHeight());
+        }
+    }
+
+    glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 }
 
 static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
@@ -461,10 +518,10 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
 
     S8Z24 input_pixel{};
     Z24S8 output_pixel{};
-    const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)};
-    for (size_t y = 0; y < height; ++y) {
-        for (size_t x = 0; x < width; ++x) {
-            const size_t offset{bpp * (y * width + x)};
+    constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)};
+    for (std::size_t y = 0; y < height; ++y) {
+        for (std::size_t x = 0; x < width; ++x) {
+            const std::size_t offset{bpp * (y * width + x)};
             std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24));
             output_pixel.s8.Assign(input_pixel.s8);
             output_pixel.z24.Assign(input_pixel.z24);
@@ -474,10 +531,10 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
 }
 
 static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
-    const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)};
-    for (size_t y = 0; y < height; ++y) {
-        for (size_t x = 0; x < width; ++x) {
-            const size_t offset{bpp * (y * width + x)};
+    constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)};
+    for (std::size_t y = 0; y < height; ++y) {
+        for (std::size_t x = 0; x < width; ++x) {
+            const std::size_t offset{bpp * (y * width + x)};
             const u8 temp{data[offset]};
             data[offset] = data[offset + 1];
             data[offset + 1] = temp;
@@ -493,7 +550,8 @@ static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
 static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
                                                u32 width, u32 height) {
     switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4: {
+    case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8: {
         // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
         u32 block_width{};
         u32 block_height{};
@@ -514,23 +572,6 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma
     }
 }
 
-/**
- * Helper function to perform software conversion (as needed) when flushing a buffer to Switch
- * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with
- * typical desktop GPUs.
- */
-static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& /*data*/, PixelFormat pixel_format,
-                                                u32 /*width*/, u32 /*height*/) {
-    switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4:
-    case PixelFormat::S8Z24:
-        LOG_CRITICAL(Render_OpenGL, "Unimplemented pixel_format={}",
-                     static_cast<u32>(pixel_format));
-        UNREACHABLE();
-        break;
-    }
-}
-
 MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192));
 void CachedSurface::LoadGLBuffer() {
     ASSERT(params.type != SurfaceType::Fill);
@@ -545,13 +586,25 @@ void CachedSurface::LoadGLBuffer() {
     MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
 
     if (params.is_tiled) {
-        gl_buffer.resize(copy_size);
+        // TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do
+        // this for 3D textures, etc.
+        switch (params.target) {
+        case SurfaceParams::SurfaceTarget::Texture2D:
+            // Pass impl. to the fallback code below
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented tiled load for target={}",
+                         static_cast<u32>(params.target));
+            UNREACHABLE();
+        }
 
-        morton_to_gl_fns[static_cast<size_t>(params.pixel_format)](
-            params.width, params.block_height, params.height, gl_buffer, params.addr);
+        gl_buffer.resize(static_cast<std::size_t>(params.depth) * copy_size);
+        morton_to_gl_fns[static_cast<std::size_t>(params.pixel_format)](
+            params.width, params.block_height, params.height, gl_buffer.data(), copy_size,
+            params.addr);
     } else {
-        const u8* const texture_src_data_end = texture_src_data + copy_size;
-
+        const u8* const texture_src_data_end{texture_src_data +
+                                             (static_cast<std::size_t>(params.depth) * copy_size)};
         gl_buffer.assign(texture_src_data, texture_src_data_end);
     }
 
@@ -560,23 +613,7 @@ void CachedSurface::LoadGLBuffer() {
 
 MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64));
 void CachedSurface::FlushGLBuffer() {
-    u8* const dst_buffer = Memory::GetPointer(params.addr);
-
-    ASSERT(dst_buffer);
-    ASSERT(gl_buffer.size() ==
-           params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
-
-    MICROPROFILE_SCOPE(OpenGL_SurfaceFlush);
-
-    ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width,
-                                        params.height);
-
-    if (!params.is_tiled) {
-        std::memcpy(dst_buffer, gl_buffer.data(), params.size_in_bytes);
-    } else {
-        gl_to_morton_fns[static_cast<size_t>(params.pixel_format)](
-            params.width, params.block_height, params.height, gl_buffer, params.addr);
-    }
+    ASSERT_MSG(false, "Unimplemented"); // Flushing is not implemented: the condition is always false
 }
 
 MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192));
@@ -586,22 +623,30 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle
 
     MICROPROFILE_SCOPE(OpenGL_TextureUL);
 
-    ASSERT(gl_buffer.size() ==
-           params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
+    ASSERT(gl_buffer.size() == static_cast<std::size_t>(params.width) * params.height *
+                                   GetGLBytesPerPixel(params.pixel_format) * params.depth);
 
     const auto& rect{params.GetRect()};
 
     // Load data from memory to the surface
-    GLint x0 = static_cast<GLint>(rect.left);
-    GLint y0 = static_cast<GLint>(rect.bottom);
-    size_t buffer_offset = (y0 * params.width + x0) * GetGLBytesPerPixel(params.pixel_format);
+    const GLint x0 = static_cast<GLint>(rect.left);
+    const GLint y0 = static_cast<GLint>(rect.bottom);
+    const std::size_t buffer_offset =
+        static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.width +
+                                 static_cast<std::size_t>(x0)) *
+        GetGLBytesPerPixel(params.pixel_format);
 
     const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
-    GLuint target_tex = texture.handle;
+    const GLuint target_tex = texture.handle;
     OpenGLState cur_state = OpenGLState::GetCurState();
 
-    GLuint old_tex = cur_state.texture_units[0].texture_2d;
-    cur_state.texture_units[0].texture_2d = target_tex;
+    const auto old_tex = cur_state.texture_units[0]; // copy; a reference would see the new binding
+    SCOPE_EXIT({
+        cur_state.texture_units[0] = old_tex;
+        cur_state.Apply();
+    });
+    cur_state.texture_units[0].texture = target_tex;
+    cur_state.texture_units[0].target = SurfaceTargetToGL(params.target);
     cur_state.Apply();
 
     // Ensure no bad interactions with GL_UNPACK_ALIGNMENT
@@ -610,136 +655,102 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle
 
     glActiveTexture(GL_TEXTURE0);
     if (tuple.compressed) {
-        glCompressedTexImage2D(
-            GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width),
-            static_cast<GLsizei>(params.height), 0, static_cast<GLsizei>(params.size_in_bytes),
-            &gl_buffer[buffer_offset]);
+        switch (params.target) {
+        case SurfaceParams::SurfaceTarget::Texture2D:
+            glCompressedTexImage2D(
+                SurfaceTargetToGL(params.target), 0, tuple.internal_format,
+                static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0,
+                static_cast<GLsizei>(params.size_in_bytes), &gl_buffer[buffer_offset]);
+            break;
+        case SurfaceParams::SurfaceTarget::Texture3D:
+        case SurfaceParams::SurfaceTarget::Texture2DArray:
+            glCompressedTexImage3D(
+                SurfaceTargetToGL(params.target), 0, tuple.internal_format,
+                static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height),
+                static_cast<GLsizei>(params.depth), 0, static_cast<GLsizei>(params.size_in_bytes),
+                &gl_buffer[buffer_offset]);
+            break;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
+                         static_cast<u32>(params.target));
+            UNREACHABLE();
+            glCompressedTexImage2D(
+                GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width),
+                static_cast<GLsizei>(params.height), 0, static_cast<GLsizei>(params.size_in_bytes),
+                &gl_buffer[buffer_offset]);
+        }
     } else {
-        glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
-                        static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
-                        &gl_buffer[buffer_offset]);
-    }
-
-    glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-
-    cur_state.texture_units[0].texture_2d = old_tex;
-    cur_state.Apply();
-}
-
-MICROPROFILE_DEFINE(OpenGL_TextureDL, "OpenGL", "Texture Download", MP_RGB(128, 192, 64));
-void CachedSurface::DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) {
-    if (params.type == SurfaceType::Fill)
-        return;
-
-    MICROPROFILE_SCOPE(OpenGL_TextureDL);
-
-    gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
-
-    OpenGLState state = OpenGLState::GetCurState();
-    OpenGLState prev_state = state;
-    SCOPE_EXIT({ prev_state.Apply(); });
-
-    const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
-
-    // Ensure no bad interactions with GL_PACK_ALIGNMENT
-    ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0);
-    glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width));
-
-    const auto& rect{params.GetRect()};
-    size_t buffer_offset =
-        (rect.bottom * params.width + rect.left) * GetGLBytesPerPixel(params.pixel_format);
-
-    state.UnbindTexture(texture.handle);
-    state.draw.read_framebuffer = read_fb_handle;
-    state.Apply();
 
-    if (params.type == SurfaceType::ColorTexture) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
-                               texture.handle, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-    } else if (params.type == SurfaceType::Depth) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
-                               texture.handle, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-    } else {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                               texture.handle, 0);
+        switch (params.target) {
+        case SurfaceParams::SurfaceTarget::Texture1D:
+            glTexSubImage1D(SurfaceTargetToGL(params.target), 0, x0,
+                            static_cast<GLsizei>(rect.GetWidth()), tuple.format, tuple.type,
+                            &gl_buffer[buffer_offset]);
+            break;
+        case SurfaceParams::SurfaceTarget::Texture2D:
+            glTexSubImage2D(SurfaceTargetToGL(params.target), 0, x0, y0,
+                            static_cast<GLsizei>(rect.GetWidth()),
+                            static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
+                            &gl_buffer[buffer_offset]);
+            break;
+        case SurfaceParams::SurfaceTarget::Texture3D:
+        case SurfaceParams::SurfaceTarget::Texture2DArray:
+            glTexSubImage3D(SurfaceTargetToGL(params.target), 0, x0, y0, 0,
+                            static_cast<GLsizei>(rect.GetWidth()),
+                            static_cast<GLsizei>(rect.GetHeight()), params.depth, tuple.format,
+                            tuple.type, &gl_buffer[buffer_offset]);
+            break;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
+                         static_cast<u32>(params.target));
+            UNREACHABLE();
+            glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
+                            static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
+                            &gl_buffer[buffer_offset]);
+        }
     }
-    glReadPixels(static_cast<GLint>(rect.left), static_cast<GLint>(rect.bottom),
-                 static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()),
-                 tuple.format, tuple.type, &gl_buffer[buffer_offset]);
 
-    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+    glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 }
 
 RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
     read_framebuffer.Create();
     draw_framebuffer.Create();
+    copy_pbo.Create();
 }
 
 Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextureInfo& config) {
     return GetSurface(SurfaceParams::CreateForTexture(config));
 }
 
-SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(bool using_color_fb,
-                                                                       bool using_depth_fb,
-                                                                       bool preserve_contents) {
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) {
+    const auto& regs{Core::System::GetInstance().GPU().Maxwell3D().regs};
+    if (!regs.zeta.Address() || !regs.zeta_enable) {
+        return {};
+    }
 
-    // TODO(bunnei): This is hard corded to use just the first render buffer
-    LOG_WARNING(Render_OpenGL, "hard-coded for render target 0!");
+    SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer(
+        regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format)};
 
-    // get color and depth surfaces
-    SurfaceParams color_params{};
-    SurfaceParams depth_params{};
+    return GetSurface(depth_params, preserve_contents);
+}
 
-    if (using_color_fb) {
-        color_params = SurfaceParams::CreateForFramebuffer(regs.rt[0]);
-    }
+Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool preserve_contents) {
+    const auto& regs{Core::System::GetInstance().GPU().Maxwell3D().regs};
 
-    if (using_depth_fb) {
-        depth_params = SurfaceParams::CreateForDepthBuffer(regs.zeta_width, regs.zeta_height,
-                                                           regs.zeta.Address(), regs.zeta.format);
-    }
+    ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
 
-    MathUtil::Rectangle<u32> color_rect{};
-    Surface color_surface;
-    if (using_color_fb) {
-        color_surface = GetSurface(color_params, preserve_contents);
-        if (color_surface) {
-            color_rect = color_surface->GetSurfaceParams().GetRect();
-        }
+    if (index >= regs.rt_control.count) {
+        return {};
     }
 
-    MathUtil::Rectangle<u32> depth_rect{};
-    Surface depth_surface;
-    if (using_depth_fb) {
-        depth_surface = GetSurface(depth_params, preserve_contents);
-        if (depth_surface) {
-            depth_rect = depth_surface->GetSurfaceParams().GetRect();
-        }
+    if (regs.rt[index].Address() == 0 || regs.rt[index].format == Tegra::RenderTargetFormat::NONE) {
+        return {};
     }
 
-    MathUtil::Rectangle<u32> fb_rect{};
-    if (color_surface && depth_surface) {
-        fb_rect = color_rect;
-        // Color and Depth surfaces must have the same dimensions and offsets
-        if (color_rect.bottom != depth_rect.bottom || color_rect.top != depth_rect.top ||
-            color_rect.left != depth_rect.left || color_rect.right != depth_rect.right) {
-            color_surface = GetSurface(color_params);
-            depth_surface = GetSurface(depth_params);
-            fb_rect = color_surface->GetSurfaceParams().GetRect();
-        }
-    } else if (color_surface) {
-        fb_rect = color_rect;
-    } else if (depth_surface) {
-        fb_rect = depth_rect;
-    }
+    const SurfaceParams color_params{SurfaceParams::CreateForFramebuffer(index)};
 
-    return std::make_tuple(color_surface, depth_surface, fb_rect);
+    return GetSurface(color_params, preserve_contents);
 }
 
 void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
@@ -748,7 +759,6 @@ void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
 }
 
 void RasterizerCacheOpenGL::FlushSurface(const Surface& surface) {
-    surface->DownloadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
     surface->FlushGLBuffer();
 }
 
@@ -806,27 +816,26 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
     // Get a new surface with the new parameters, and blit the previous surface to it
     Surface new_surface{GetUncachedSurface(new_params)};
 
-    // If format is unchanged, we can do a faster blit without reinterpreting pixel data
-    if (params.pixel_format == new_params.pixel_format) {
+    if (params.pixel_format == new_params.pixel_format ||
+        !Settings::values.use_accurate_framebuffers) {
+        // If the format is the same, just do a framebuffer blit. This is significantly faster than
+        // using PBOs. This is also likely less accurate, as textures will be converted rather than
+        // reinterpreted.
+
         BlitTextures(surface->Texture().handle, params.GetRect(), new_surface->Texture().handle,
-                     new_surface->GetSurfaceParams().GetRect(), params.type,
-                     read_framebuffer.handle, draw_framebuffer.handle);
-        return new_surface;
-    }
+                     params.GetRect(), params.type, read_framebuffer.handle,
+                     draw_framebuffer.handle);
+    } else {
+        // When use_accurate_framebuffers setting is enabled, perform a more accurate surface copy,
+        // where pixels are reinterpreted as a new format (without conversion). This code path uses
+        // OpenGL PBOs and is quite slow.
 
-    // When using accurate framebuffers, always copy old data to new surface, regardless of format
-    if (Settings::values.use_accurate_framebuffers) {
         auto source_format = GetFormatTuple(params.pixel_format, params.component_type);
         auto dest_format = GetFormatTuple(new_params.pixel_format, new_params.component_type);
 
-        size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes());
+        std::size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes());
 
-        // Use a Pixel Buffer Object to download the previous texture and then upload it to the new
-        // one using the new format.
-        OGLBuffer pbo;
-        pbo.Create();
-
-        glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo.handle);
+        glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo.handle);
         glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW_ARB);
         if (source_format.compressed) {
             glGetCompressedTextureImage(surface->Texture().handle, 0,
@@ -845,10 +854,10 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
                 // of the data in this case. Games like Super Mario Odyssey seem to hit this case
                 // when drawing, it re-uses the memory of a previous texture as a bigger framebuffer
                 // but it doesn't clear it beforehand, the texture is already full of zeros.
-                LOG_CRITICAL(HW_GPU, "Trying to upload extra texture data from the CPU during "
-                                     "reinterpretation but the texture is tiled.");
+                LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during "
+                                  "reinterpretation but the texture is tiled.");
             }
-            size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes();
+            std::size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes();
             std::vector<u8> data(remaining_size);
             Memory::ReadBlock(new_params.addr + params.SizeInBytes(), data.data(), data.size());
             glBufferSubData(GL_PIXEL_PACK_BUFFER, params.SizeInBytes(), remaining_size,
@@ -859,21 +868,38 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
 
         const auto& dest_rect{new_params.GetRect()};
 
-        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo.handle);
+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo.handle);
         if (dest_format.compressed) {
-            glCompressedTexSubImage2D(
-                GL_TEXTURE_2D, 0, 0, 0, static_cast<GLsizei>(dest_rect.GetWidth()),
-                static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format,
-                static_cast<GLsizei>(new_params.SizeInBytes()), nullptr);
+            LOG_CRITICAL(HW_GPU, "Compressed copy is unimplemented!");
+            UNREACHABLE();
         } else {
-            glTextureSubImage2D(new_surface->Texture().handle, 0, 0, 0,
-                                static_cast<GLsizei>(dest_rect.GetWidth()),
-                                static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format,
-                                dest_format.type, nullptr);
+            switch (new_params.target) {
+            case SurfaceParams::SurfaceTarget::Texture1D:
+                glTextureSubImage1D(new_surface->Texture().handle, 0, 0,
+                                    static_cast<GLsizei>(dest_rect.GetWidth()), dest_format.format,
+                                    dest_format.type, nullptr);
+                break;
+            case SurfaceParams::SurfaceTarget::Texture2D:
+                glTextureSubImage2D(new_surface->Texture().handle, 0, 0, 0,
+                                    static_cast<GLsizei>(dest_rect.GetWidth()),
+                                    static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format,
+                                    dest_format.type, nullptr);
+                break;
+            case SurfaceParams::SurfaceTarget::Texture3D:
+            case SurfaceParams::SurfaceTarget::Texture2DArray:
+                glTextureSubImage3D(new_surface->Texture().handle, 0, 0, 0, 0,
+                                    static_cast<GLsizei>(dest_rect.GetWidth()),
+                                    static_cast<GLsizei>(dest_rect.GetHeight()),
+                                    static_cast<GLsizei>(new_params.depth), dest_format.format,
+                                    dest_format.type, nullptr);
+                break;
+            default:
+                LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
+                             static_cast<u32>(new_params.target));
+                UNREACHABLE();
+            }
         }
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-        pbo.Release();
     }
 
     return new_surface;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index aad75f200..d7a4bc37f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -70,19 +70,20 @@ struct SurfaceParams {
         RG8S = 42,
         RG32UI = 43,
         R32UI = 44,
+        ASTC_2D_8X8 = 45,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 45,
-        Z16 = 46,
+        Z32F = 46,
+        Z16 = 47,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 47,
-        S8Z24 = 48,
-        Z32FS8 = 49,
+        Z24S8 = 48,
+        S8Z24 = 49,
+        Z32FS8 = 50,
 
         MaxDepthStencilFormat,
 
@@ -90,7 +91,7 @@ struct SurfaceParams {
         Invalid = 255,
     };
 
-    static constexpr size_t MaxPixelFormat = static_cast<size_t>(PixelFormat::Max);
+    static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max);
 
     enum class ComponentType {
         Invalid = 0,
@@ -109,6 +110,33 @@ struct SurfaceParams {
         Invalid = 4,
     };
 
+    enum class SurfaceTarget {
+        Texture1D,
+        Texture2D,
+        Texture3D,
+        Texture1DArray,
+        Texture2DArray,
+        TextureCubemap,
+    };
+
+    static SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type) {
+        switch (texture_type) {
+        case Tegra::Texture::TextureType::Texture1D:
+            return SurfaceTarget::Texture1D;
+        case Tegra::Texture::TextureType::Texture2D:
+        case Tegra::Texture::TextureType::Texture2DNoMipmap:
+            return SurfaceTarget::Texture2D;
+        case Tegra::Texture::TextureType::Texture1DArray:
+            return SurfaceTarget::Texture1DArray;
+        case Tegra::Texture::TextureType::Texture2DArray:
+            return SurfaceTarget::Texture2DArray;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type));
+            UNREACHABLE();
+            return SurfaceTarget::Texture2D;
+        }
+    }
+
     /**
      * Gets the compression factor for the specified PixelFormat. This applies to just the
      * "compressed width" and "compressed height", not the overall compression factor of a
@@ -165,6 +193,7 @@ struct SurfaceParams {
             1, // RG8S
             1, // RG32UI
             1, // R32UI
+            4, // ASTC_2D_8X8
             1, // Z32F
             1, // Z16
             1, // Z24S8
@@ -172,8 +201,8 @@ struct SurfaceParams {
             1, // Z32FS8
         }};
 
-        ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
-        return compression_factor_table[static_cast<size_t>(format)];
+        ASSERT(static_cast<std::size_t>(format) < compression_factor_table.size());
+        return compression_factor_table[static_cast<std::size_t>(format)];
     }
 
     static constexpr u32 GetFormatBpp(PixelFormat format) {
@@ -226,6 +255,7 @@ struct SurfaceParams {
             16,  // RG8S
             64,  // RG32UI
             32,  // R32UI
+            16,  // ASTC_2D_8X8
             32,  // Z32F
             16,  // Z16
             32,  // Z24S8
@@ -233,8 +263,8 @@ struct SurfaceParams {
             64,  // Z32FS8
         }};
 
-        ASSERT(static_cast<size_t>(format) < bpp_table.size());
-        return bpp_table[static_cast<size_t>(format)];
+        ASSERT(static_cast<std::size_t>(format) < bpp_table.size());
+        return bpp_table[static_cast<std::size_t>(format)];
     }
 
     u32 GetFormatBpp() const {
@@ -270,6 +300,7 @@ struct SurfaceParams {
             return PixelFormat::ABGR8S;
         case Tegra::RenderTargetFormat::RGBA8_UINT:
             return PixelFormat::ABGR8UI;
+        case Tegra::RenderTargetFormat::BGRA8_SRGB:
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
@@ -288,6 +319,8 @@ struct SurfaceParams {
             return PixelFormat::R11FG11FB10F;
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
             return PixelFormat::B5G6R5U;
+        case Tegra::RenderTargetFormat::BGR5A1_UNORM:
+            return PixelFormat::A1B5G5R5U;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
             return PixelFormat::RGBA32UI;
         case Tegra::RenderTargetFormat::R8_UNORM:
@@ -494,6 +527,8 @@ struct SurfaceParams {
             return PixelFormat::BC6H_SF16;
         case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
             return PixelFormat::ASTC_2D_4X4;
+        case Tegra::Texture::TextureFormat::ASTC_2D_8X8:
+            return PixelFormat::ASTC_2D_8X8;
         case Tegra::Texture::TextureFormat::R16_G16:
             switch (component_type) {
             case Tegra::Texture::ComponentType::FLOAT:
@@ -542,11 +577,13 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RGBA8_UNORM:
         case Tegra::RenderTargetFormat::RGBA8_SRGB:
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
+        case Tegra::RenderTargetFormat::BGRA8_SRGB:
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
         case Tegra::RenderTargetFormat::R8_UNORM:
         case Tegra::RenderTargetFormat::RG16_UNORM:
         case Tegra::RenderTargetFormat::R16_UNORM:
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+        case Tegra::RenderTargetFormat::BGR5A1_UNORM:
         case Tegra::RenderTargetFormat::RG8_UNORM:
         case Tegra::RenderTargetFormat::RGBA16_UNORM:
             return ComponentType::UNorm;
@@ -607,16 +644,18 @@ struct SurfaceParams {
     }
 
     static SurfaceType GetFormatType(PixelFormat pixel_format) {
-        if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxColorFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxColorFormat)) {
             return SurfaceType::ColorTexture;
         }
 
-        if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxDepthFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) {
             return SurfaceType::Depth;
         }
 
-        if (static_cast<size_t>(pixel_format) <
-            static_cast<size_t>(PixelFormat::MaxDepthStencilFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) {
             return SurfaceType::DepthStencil;
         }
 
@@ -630,20 +669,19 @@ struct SurfaceParams {
     MathUtil::Rectangle<u32> GetRect() const;
 
     /// Returns the size of this surface in bytes, adjusted for compression
-    size_t SizeInBytes() const {
+    std::size_t SizeInBytes() const {
         const u32 compression_factor{GetCompressionFactor(pixel_format)};
         ASSERT(width % compression_factor == 0);
         ASSERT(height % compression_factor == 0);
         return (width / compression_factor) * (height / compression_factor) *
-               GetFormatBpp(pixel_format) / CHAR_BIT;
+               GetFormatBpp(pixel_format) * depth / CHAR_BIT;
     }
 
     /// Creates SurfaceParams from a texture configuration
     static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config);
 
     /// Creates SurfaceParams from a framebuffer configuration
-    static SurfaceParams CreateForFramebuffer(
-        const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config);
+    static SurfaceParams CreateForFramebuffer(std::size_t index);
 
     /// Creates SurfaceParams for a depth buffer configuration
     static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
@@ -652,8 +690,8 @@ struct SurfaceParams {
 
     /// Checks if surfaces are compatible for caching
     bool IsCompatibleSurface(const SurfaceParams& other) const {
-        return std::tie(pixel_format, type, cache_width, cache_height) ==
-               std::tie(other.pixel_format, other.type, other.cache_width, other.cache_height);
+        return std::tie(pixel_format, type, width, height) ==
+               std::tie(other.pixel_format, other.type, other.width, other.height);
     }
 
     VAddr addr;
@@ -664,12 +702,10 @@ struct SurfaceParams {
     SurfaceType type;
     u32 width;
     u32 height;
+    u32 depth;
     u32 unaligned_height;
-    size_t size_in_bytes;
-
-    // Parameters used for caching only
-    u32 cache_width;
-    u32 cache_height;
+    std::size_t size_in_bytes;
+    SurfaceTarget target;
 };
 
 }; // namespace OpenGL
@@ -685,7 +721,7 @@ struct SurfaceReserveKey : Common::HashableStruct<OpenGL::SurfaceParams> {
 namespace std {
 template <>
 struct hash<SurfaceReserveKey> {
-    size_t operator()(const SurfaceReserveKey& k) const {
+    std::size_t operator()(const SurfaceReserveKey& k) const {
         return k.Hash();
     }
 };
@@ -701,7 +737,7 @@ public:
         return params.addr;
     }
 
-    size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const {
         return params.size_in_bytes;
     }
 
@@ -709,6 +745,10 @@ public:
         return texture;
     }
 
+    GLenum Target() const {
+        return gl_target;
+    }
+
     static constexpr unsigned int GetGLBytesPerPixel(SurfaceParams::PixelFormat format) {
         if (format == SurfaceParams::PixelFormat::Invalid)
             return 0;
@@ -724,14 +764,14 @@ public:
     void LoadGLBuffer();
     void FlushGLBuffer();
 
-    // Upload/Download data in gl_buffer in/to this surface's texture
+    // Upload data in gl_buffer to this surface's texture
     void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle);
-    void DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle);
 
 private:
     OGLTexture texture;
     std::vector<u8> gl_buffer;
     SurfaceParams params;
+    GLenum gl_target;
 };
 
 class RasterizerCacheOpenGL final : public RasterizerCache<Surface> {
@@ -741,9 +781,11 @@ public:
     /// Get a surface based on the texture configuration
     Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config);
 
-    /// Get the color and depth surfaces based on the framebuffer configuration
-    SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb,
-                                                    bool preserve_contents);
+    /// Get the depth surface based on the framebuffer configuration
+    Surface GetDepthBufferSurface(bool preserve_contents);
+
+    /// Get the color surface based on the framebuffer configuration and the specified render target
+    Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);
 
     /// Flushes the surface to Switch memory
     void FlushSurface(const Surface& surface);
@@ -774,6 +816,10 @@ private:
 
     OGLFramebuffer read_framebuffer;
     OGLFramebuffer draw_framebuffer;
+
+    /// Use a Pixel Buffer Object to download the previous texture and then upload it to the new one
+    /// using the new format.
+    OGLBuffer copy_pbo;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index ac9adfd83..894fe6eae 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -13,8 +13,8 @@ namespace OpenGL {
 
 /// Gets the address for the specified shader stage program
 static VAddr GetShaderAddress(Maxwell::ShaderProgram program) {
-    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
-    auto& shader_config = gpu.regs.shader_config[static_cast<size_t>(program)];
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& shader_config = gpu.regs.shader_config[static_cast<std::size_t>(program)];
     return *gpu.memory_manager.GpuToCpuAddress(gpu.regs.code_address.CodeAddress() +
                                                shader_config.offset);
 }
@@ -28,7 +28,7 @@ static GLShader::ProgramCode GetShaderCode(VAddr addr) {
 
 /// Helper function to set shader uniform block bindings for a single shader stage
 static void SetShaderUniformBlockBinding(GLuint shader, const char* name,
-                                         Maxwell::ShaderStage binding, size_t expected_size) {
+                                         Maxwell::ShaderStage binding, std::size_t expected_size) {
     const GLuint ub_index = glGetUniformBlockIndex(shader, name);
     if (ub_index == GL_INVALID_INDEX) {
         return;
@@ -36,7 +36,7 @@ static void SetShaderUniformBlockBinding(GLuint shader, const char* name,
 
     GLint ub_size = 0;
     glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size);
-    ASSERT_MSG(static_cast<size_t>(ub_size) == expected_size,
+    ASSERT_MSG(static_cast<std::size_t>(ub_size) == expected_size,
                "Uniform block size did not match! Got {}, expected {}", ub_size, expected_size);
     glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding));
 }
@@ -85,23 +85,23 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type)
     SetShaderUniformBlockBindings(program.handle);
 }
 
-GLuint CachedShader::GetProgramResourceIndex(const std::string& name) {
-    auto search{resource_cache.find(name)};
+GLuint CachedShader::GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer) {
+    const auto search{resource_cache.find(buffer.GetHash())};
     if (search == resource_cache.end()) {
         const GLuint index{
-            glGetProgramResourceIndex(program.handle, GL_UNIFORM_BLOCK, name.c_str())};
-        resource_cache[name] = index;
+            glGetProgramResourceIndex(program.handle, GL_UNIFORM_BLOCK, buffer.GetName().c_str())};
+        resource_cache[buffer.GetHash()] = index;
         return index;
     }
 
     return search->second;
 }
 
-GLint CachedShader::GetUniformLocation(const std::string& name) {
-    auto search{uniform_cache.find(name)};
+GLint CachedShader::GetUniformLocation(const GLShader::SamplerEntry& sampler) {
+    const auto search{uniform_cache.find(sampler.GetHash())};
     if (search == uniform_cache.end()) {
-        const GLint index{glGetUniformLocation(program.handle, name.c_str())};
-        uniform_cache[name] = index;
+        const GLint index{glGetUniformLocation(program.handle, sampler.GetName().c_str())};
+        uniform_cache[sampler.GetHash()] = index;
         return index;
     }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 759987604..9bafe43a9 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -4,8 +4,8 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
-#include <unordered_map>
 
 #include "common/common_types.h"
 #include "video_core/rasterizer_cache.h"
@@ -28,7 +28,7 @@ public:
     }
 
     /// Gets the size of the shader in guest memory, required for cache management
-    size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const {
         return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64);
     }
 
@@ -43,10 +43,10 @@ public:
     }
 
     /// Gets the GL program resource location for the specified resource, caching as needed
-    GLuint GetProgramResourceIndex(const std::string& name);
+    GLuint GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer);
 
     /// Gets the GL uniform location for the specified resource, caching as needed
-    GLint GetUniformLocation(const std::string& name);
+    GLint GetUniformLocation(const GLShader::SamplerEntry& sampler);
 
 private:
     VAddr addr;
@@ -55,8 +55,8 @@ private:
     GLShader::ShaderEntries entries;
     OGLProgram program;
 
-    std::unordered_map<std::string, GLuint> resource_cache;
-    std::unordered_map<std::string, GLint> uniform_cache;
+    std::map<u32, GLuint> resource_cache;
+    std::map<u32, GLint> uniform_cache;
 };
 
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 391c92d47..b3e95187e 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -12,6 +12,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/engines/shader_header.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 
@@ -26,7 +27,7 @@ using Tegra::Shader::Sampler;
 using Tegra::Shader::SubOp;
 
 constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH;
-constexpr u32 PROGRAM_HEADER_SIZE = 0x50;
+constexpr u32 PROGRAM_HEADER_SIZE = sizeof(Tegra::Shader::Header);
 
 class DecompileFail : public std::runtime_error {
 public:
@@ -113,7 +114,7 @@ private:
 
     /// Scans a range of code for labels and determines the exit method.
     ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels) {
-        auto [iter, inserted] =
+        const auto [iter, inserted] =
             exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
         ExitMethod& exit_method = iter->second;
         if (!inserted)
@@ -131,22 +132,22 @@ private:
                     if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
                         return exit_method = ExitMethod::AlwaysEnd;
                     } else {
-                        ExitMethod not_met = Scan(offset + 1, end, labels);
+                        const ExitMethod not_met = Scan(offset + 1, end, labels);
                         return exit_method = ParallelExit(ExitMethod::AlwaysEnd, not_met);
                     }
                 }
                 case OpCode::Id::BRA: {
-                    u32 target = offset + instr.bra.GetBranchTarget();
+                    const u32 target = offset + instr.bra.GetBranchTarget();
                     labels.insert(target);
-                    ExitMethod no_jmp = Scan(offset + 1, end, labels);
-                    ExitMethod jmp = Scan(target, end, labels);
+                    const ExitMethod no_jmp = Scan(offset + 1, end, labels);
+                    const ExitMethod jmp = Scan(target, end, labels);
                     return exit_method = ParallelExit(no_jmp, jmp);
                 }
                 case OpCode::Id::SSY: {
                     // The SSY instruction uses a similar encoding as the BRA instruction.
                     ASSERT_MSG(instr.bra.constant_buffer == 0,
                                "Constant buffer SSY is not supported");
-                    u32 target = offset + instr.bra.GetBranchTarget();
+                    const u32 target = offset + instr.bra.GetBranchTarget();
                     labels.insert(target);
                     // Continue scanning for an exit method.
                     break;
@@ -189,7 +190,7 @@ public:
 
 private:
     void AppendIndentation() {
-        shader_source.append(static_cast<size_t>(scope) * 4, ' ');
+        shader_source.append(static_cast<std::size_t>(scope) * 4, ' ');
     }
 
     std::string shader_source;
@@ -208,7 +209,7 @@ public:
         UnsignedInteger,
     };
 
-    GLSLRegister(size_t index, const std::string& suffix) : index{index}, suffix{suffix} {}
+    GLSLRegister(std::size_t index, const std::string& suffix) : index{index}, suffix{suffix} {}
 
     /// Gets the GLSL type string for a register
     static std::string GetTypeString() {
@@ -226,15 +227,23 @@ public:
     }
 
     /// Returns the index of the register
-    size_t GetIndex() const {
+    std::size_t GetIndex() const {
         return index;
     }
 
 private:
-    const size_t index;
+    const std::size_t index;
     const std::string& suffix;
 };
 
+enum class InternalFlag : u64 {
+    ZeroFlag = 0, // Written by SetRegisterToInteger when sets_cc is true; read for NEU
+    CarryFlag = 1,
+    OverflowFlag = 2,
+    NaNFlag = 3,
+    Amount // Number of flags; used as the loop bound when the flag variables are declared
+};
+
 /**
  * Used to manage shader registers that are emulated with GLSL. This class keeps track of the state
  * of all registers (e.g. whether they are currently being used as Floats or Integers), and
@@ -247,6 +256,7 @@ public:
                         const Maxwell3D::Regs::ShaderStage& stage, const std::string& suffix)
         : shader{shader}, declarations{declarations}, stage{stage}, suffix{suffix} {
         BuildRegisterList();
+        BuildInputList();
     }
 
     /**
@@ -327,13 +337,19 @@ public:
     void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
                               const std::string& value, u64 dest_num_components,
                               u64 value_num_components, bool is_saturated = false,
-                              u64 dest_elem = 0, Register::Size size = Register::Size::Word) {
+                              u64 dest_elem = 0, Register::Size size = Register::Size::Word,
+                              bool sets_cc = false) {
         ASSERT_MSG(!is_saturated, "Unimplemented");
 
         const std::string func{is_signed ? "intBitsToFloat" : "uintBitsToFloat"};
 
         SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')',
                     dest_num_components, value_num_components, dest_elem);
+
+        if (sets_cc) {
+            const std::string zero_condition = "( " + ConvertIntegerSize(value, size) + " == 0 )";
+            SetInternalFlag(InternalFlag::ZeroFlag, zero_condition);
+        }
     }
 
     /**
@@ -343,12 +359,33 @@ public:
      * @param elem The element to use for the operation.
      * @param attribute The input attribute to use as the source value.
      */
-    void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute) {
-        std::string dest = GetRegisterAsFloat(reg);
-        std::string src = GetInputAttribute(attribute) + GetSwizzle(elem);
+    void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute,
+                                    const Tegra::Shader::IpaMode& input_mode) {
+        const std::string dest = GetRegisterAsFloat(reg);
+        const std::string src = GetInputAttribute(attribute, input_mode) + GetSwizzle(elem);
         shader.AddLine(dest + " = " + src + ';');
     }
 
+    std::string GetControlCode(const Tegra::Shader::ControlCode cc) const {
+        if (cc == Tegra::Shader::ControlCode::NEU) {
+            // NEU is expressed as the negation of the emulated zero flag.
+            return "!(" + GetInternalFlag(InternalFlag::ZeroFlag) + ')';
+        }
+        // Every other control code is unimplemented: report it and emit a constant condition.
+        LOG_CRITICAL(HW_GPU, "Unimplemented Control Code {}", static_cast<u32>(cc));
+        UNREACHABLE();
+        return "false";
+    }
+
+    std::string GetInternalFlag(const InternalFlag ii) const {
+        // The variable name embeds the flag's numeric value followed by `suffix`.
+        return "internalFlag_" + std::to_string(static_cast<u32>(ii)) + suffix;
+    }
+
+    void SetInternalFlag(const InternalFlag ii, const std::string& value) const {
+        shader.AddLine(fmt::format("{} = {};", GetInternalFlag(ii), value)); // GLSL assignment
+    }
+
     /**
      * Writes code that does a output attribute assignment to register operation. Output attributes
      * are stored as floats, so this may require conversion.
@@ -357,8 +394,8 @@ public:
      * @param reg The register to use as the source value.
      */
     void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& reg) {
-        std::string dest = GetOutputAttribute(attribute);
-        std::string src = GetRegisterAsFloat(reg);
+        const std::string dest = GetOutputAttribute(attribute);
+        const std::string src = GetRegisterAsFloat(reg);
 
         if (!dest.empty()) {
             // Can happen with unknown/unimplemented output attributes, in which case we ignore the
@@ -391,9 +428,9 @@ public:
                                    GLSLRegister::Type type) {
         declr_const_buffers[cbuf_index].MarkAsUsedIndirect(cbuf_index, stage);
 
-        std::string final_offset = fmt::format("({} + {})", index_str, offset / 4);
-        std::string value = 'c' + std::to_string(cbuf_index) + '[' + final_offset + " / 4][" +
-                            final_offset + " % 4]";
+        const std::string final_offset = fmt::format("({} + {})", index_str, offset / 4);
+        const std::string value = 'c' + std::to_string(cbuf_index) + '[' + final_offset + " / 4][" +
+                                  final_offset + " % 4]";
 
         if (type == GLSLRegister::Type::Float) {
             return value;
@@ -412,12 +449,19 @@ public:
         }
         declarations.AddNewLine();
 
-        for (const auto& index : declr_input_attribute) {
+        for (u32 ii = 0; ii < static_cast<u64>(InternalFlag::Amount); ii++) {
+            const InternalFlag code = static_cast<InternalFlag>(ii);
+            declarations.AddLine("bool " + GetInternalFlag(code) + " = false;");
+        }
+        declarations.AddNewLine();
+
+        for (const auto element : declr_input_attribute) {
             // TODO(bunnei): Use proper number of elements for these
-            declarations.AddLine("layout(location = " +
-                                 std::to_string(static_cast<u32>(index) -
-                                                static_cast<u32>(Attribute::Index::Attribute_0)) +
-                                 ") in vec4 " + GetInputAttribute(index) + ';');
+            u32 idx =
+                static_cast<u32>(element.first) - static_cast<u32>(Attribute::Index::Attribute_0);
+            declarations.AddLine("layout(location = " + std::to_string(idx) + ")" +
+                                 GetInputFlags(element.first) + "in vec4 " +
+                                 GetInputAttribute(element.first, element.second) + ';');
         }
         declarations.AddNewLine();
 
@@ -440,13 +484,12 @@ public:
         }
         declarations.AddNewLine();
 
-        // Append the sampler2D array for the used textures.
-        size_t num_samplers = GetSamplers().size();
-        if (num_samplers > 0) {
-            declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' +
-                                 std::to_string(num_samplers) + "];");
-            declarations.AddNewLine();
+        const auto& samplers = GetSamplers();
+        for (const auto& sampler : samplers) {
+            declarations.AddLine("uniform " + sampler.GetTypeString() + ' ' + sampler.GetName() +
+                                 ';');
         }
+        declarations.AddNewLine();
     }
 
     /// Returns a list of constant buffer declarations
@@ -458,27 +501,29 @@ public:
     }
 
     /// Returns a list of samplers used in the shader
-    std::vector<SamplerEntry> GetSamplers() const {
+    const std::vector<SamplerEntry>& GetSamplers() const {
         return used_samplers;
     }
 
     /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if
     /// necessary.
-    std::string AccessSampler(const Sampler& sampler) {
-        size_t offset = static_cast<size_t>(sampler.index.Value());
+    std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type,
+                              bool is_array) {
+        const std::size_t offset = static_cast<std::size_t>(sampler.index.Value());
 
         // If this sampler has already been used, return the existing mapping.
-        auto itr =
+        const auto itr =
             std::find_if(used_samplers.begin(), used_samplers.end(),
                          [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; });
 
         if (itr != used_samplers.end()) {
+            ASSERT(itr->GetType() == type && itr->IsArray() == is_array);
             return itr->GetName();
         }
 
         // Otherwise create a new mapping for this sampler
-        size_t next_index = used_samplers.size();
-        SamplerEntry entry{stage, offset, next_index};
+        const std::size_t next_index = used_samplers.size();
+        const SamplerEntry entry{stage, offset, next_index, type, is_array};
         used_samplers.emplace_back(entry);
         return entry.GetName();
     }
@@ -527,16 +572,29 @@ private:
     void BuildRegisterList() {
         regs.reserve(Register::NumRegisters);
 
-        for (size_t index = 0; index < Register::NumRegisters; ++index) {
+        for (std::size_t index = 0; index < Register::NumRegisters; ++index) {
             regs.emplace_back(index, suffix);
         }
     }
 
+    void BuildInputList() {
+        constexpr u32 first_generic = static_cast<u32>(Attribute::Index::Attribute_0);
+        constexpr u32 last_generic = static_cast<u32>(Attribute::Index::Attribute_31);
+        declr_input_attribute.reserve(last_generic - first_generic + 1); // inclusive range
+    }
+
     /// Generates code representing an input attribute register.
-    std::string GetInputAttribute(Attribute::Index attribute) {
+    std::string GetInputAttribute(Attribute::Index attribute,
+                                  const Tegra::Shader::IpaMode& input_mode) {
         switch (attribute) {
         case Attribute::Index::Position:
-            return "position";
+            if (stage != Maxwell3D::Regs::ShaderStage::Fragment) {
+                return "position";
+            } else {
+                return "vec4(gl_FragCoord.x, gl_FragCoord.y, gl_FragCoord.z, 1.0)";
+            }
+        case Attribute::Index::PointCoord:
+            return "vec4(gl_PointCoord.x, gl_PointCoord.y, 0, 0)";
         case Attribute::Index::TessCoordInstanceIDVertexID:
             // TODO(Subv): Find out what the values are for the first two elements when inside a
             // vertex shader, and what's the value of the fourth element when inside a Tess Eval
@@ -552,7 +610,14 @@ private:
                             static_cast<u32>(Attribute::Index::Attribute_0)};
             if (attribute >= Attribute::Index::Attribute_0 &&
                 attribute <= Attribute::Index::Attribute_31) {
-                declr_input_attribute.insert(attribute);
+                if (declr_input_attribute.count(attribute) == 0) {
+                    declr_input_attribute[attribute] = input_mode;
+                } else {
+                    if (declr_input_attribute[attribute] != input_mode) {
+                        LOG_CRITICAL(HW_GPU, "Same Input multiple input modes");
+                        UNREACHABLE();
+                    }
+                }
                 return "input_attribute_" + std::to_string(index);
             }
 
@@ -563,6 +628,49 @@ private:
         return "vec4(0, 0, 0, 0)";
     }
 
+    /// Builds the GLSL qualifier text placed in front of "in vec4" for a declared input
+    /// attribute, based on the interpolation/sampling mode recorded for that attribute.
+    std::string GetInputFlags(const Attribute::Index attribute) {
+        // Both settings come from the mode stored for this attribute.
+        const Tegra::Shader::IpaMode& mode = declr_input_attribute[attribute];
+        const Tegra::Shader::IpaInterpMode interp_mode = mode.interpolation_mode;
+        const Tegra::Shader::IpaSampleMode sample_mode = mode.sampling_mode;
+
+        // Interpolation mode: the only part that contributes text to the result.
+        std::string qualifiers;
+        switch (interp_mode) {
+        case Tegra::Shader::IpaInterpMode::Flat:
+            qualifiers += "flat ";
+            break;
+        case Tegra::Shader::IpaInterpMode::Linear:
+            qualifiers += "noperspective ";
+            break;
+        case Tegra::Shader::IpaInterpMode::Perspective:
+            // Default (smooth) interpolation, so no qualifier is emitted.
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled Ipa InterpMode: {}", static_cast<u32>(interp_mode));
+            UNREACHABLE();
+        }
+
+        // Sampling mode: only validated, it never adds a qualifier here.
+        switch (sample_mode) {
+        case Tegra::Shader::IpaSampleMode::Default:
+            // Nothing to emit.
+            break;
+        case Tegra::Shader::IpaSampleMode::Centroid:
+            // Not implemented yet; the GLSL "centroid " keyword could be emitted here.
+            LOG_CRITICAL(HW_GPU, "Ipa Sampler Mode: centroid, not implemented");
+            UNREACHABLE();
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled Ipa SampleMode: {}", static_cast<u32>(sample_mode));
+            UNREACHABLE();
+        }
+
+        return qualifiers;
+    }
+
     /// Generates code representing an output attribute register.
     std::string GetOutputAttribute(Attribute::Index attribute) {
         switch (attribute) {
@@ -593,7 +701,7 @@ private:
     ShaderWriter& shader;
     ShaderWriter& declarations;
     std::vector<GLSLRegister> regs;
-    std::set<Attribute::Index> declr_input_attribute;
+    std::unordered_map<Attribute::Index, Tegra::Shader::IpaMode> declr_input_attribute;
     std::set<Attribute::Index> declr_output_attribute;
     std::array<ConstBufferEntry, Maxwell3D::Regs::MaxConstBuffers> declr_const_buffers;
     std::vector<SamplerEntry> used_samplers;
@@ -607,7 +715,7 @@ public:
                   u32 main_offset, Maxwell3D::Regs::ShaderStage stage, const std::string& suffix)
         : subroutines(subroutines), program_code(program_code), main_offset(main_offset),
           stage(stage), suffix(suffix) {
-
+        std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
         Generate(suffix);
     }
 
@@ -621,26 +729,9 @@ public:
     }
 
 private:
-    // Shader program header for a Fragment Shader.
-    struct FragmentHeader {
-        INSERT_PADDING_WORDS(5);
-        INSERT_PADDING_WORDS(13);
-        u32 enabled_color_outputs;
-        union {
-            BitField<0, 1, u32> writes_samplemask;
-            BitField<1, 1, u32> writes_depth;
-        };
-
-        bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const {
-            u32 bit = render_target * 4 + component;
-            return enabled_color_outputs & (1 << bit);
-        }
-    };
-    static_assert(sizeof(FragmentHeader) == PROGRAM_HEADER_SIZE, "FragmentHeader size is wrong");
-
     /// Gets the Subroutine object corresponding to the specified address.
     const Subroutine& GetSubroutine(u32 begin, u32 end) const {
-        auto iter = subroutines.find(Subroutine{begin, end, suffix});
+        const auto iter = subroutines.find(Subroutine{begin, end, suffix});
         ASSERT(iter != subroutines.end());
         return *iter;
     }
@@ -656,8 +747,8 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler) {
-        return regs.AccessSampler(sampler);
+    std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array) {
+        return regs.AccessSampler(sampler, type, is_array);
     }
 
     /**
@@ -685,7 +776,7 @@ private:
         // Can't assign to the constant predicate.
         ASSERT(pred != static_cast<u64>(Pred::UnusedIndex));
 
-        std::string variable = 'p' + std::to_string(pred) + '_' + suffix;
+        const std::string variable = 'p' + std::to_string(pred) + '_' + suffix;
         shader.AddLine(variable + " = " + value + ';');
         declr_predicates.insert(std::move(variable));
     }
@@ -795,7 +886,7 @@ private:
      */
     bool IsSchedInstruction(u32 offset) const {
         // sched instructions appear once every 4 instructions.
-        static constexpr size_t SchedPeriod = 4;
+        static constexpr std::size_t SchedPeriod = 4;
         u32 absolute_offset = offset - main_offset;
 
         return (absolute_offset % SchedPeriod) == 0;
@@ -863,7 +954,7 @@ private:
         std::string result;
         result += '(';
 
-        for (size_t i = 0; i < shift_amounts.size(); ++i) {
+        for (std::size_t i = 0; i < shift_amounts.size(); ++i) {
             if (i)
                 result += '|';
             result += "(((" + imm_lut + " >> (((" + op_c + " >> " + shift_amounts[i] +
@@ -887,7 +978,7 @@ private:
         // TEXS has two destination registers and a swizzle. The first two elements in the swizzle
         // go into gpr0+0 and gpr0+1, and the rest goes into gpr28+0 and gpr28+1
 
-        size_t written_components = 0;
+        std::size_t written_components = 0;
         for (u32 component = 0; component < 4; ++component) {
             if (!instr.texs.IsComponentEnabled(component)) {
                 continue;
@@ -941,10 +1032,8 @@ private:
     /// Writes the output values from a fragment shader to the corresponding GLSL output variables.
     void EmitFragmentOutputsWrite() {
         ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment);
-        FragmentHeader header;
-        std::memcpy(&header, program_code.data(), PROGRAM_HEADER_SIZE);
 
-        ASSERT_MSG(header.writes_samplemask == 0, "Samplemask write is unimplemented");
+        ASSERT_MSG(header.ps.omap.sample_mask == 0, "Samplemask write is unimplemented");
 
         // Write the color outputs using the data in the shader registers, disabled
         // rendertargets/components are skipped in the register assignment.
@@ -953,18 +1042,22 @@ private:
              ++render_target) {
             // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
             for (u32 component = 0; component < 4; ++component) {
-                if (header.IsColorComponentOutputEnabled(render_target, component)) {
-                    shader.AddLine(fmt::format("color[{}][{}] = {};", render_target, component,
+                if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
+                    shader.AddLine(fmt::format("FragColor{}[{}] = {};", render_target, component,
                                                regs.GetRegisterAsFloat(current_reg)));
                     ++current_reg;
                 }
             }
         }
 
-        if (header.writes_depth) {
+        if (header.ps.omap.depth) {
             // The depth output is always 2 registers after the last color output, and current_reg
             // already contains one past the last color register.
-            shader.AddLine("gl_FragDepth = " + regs.GetRegisterAsFloat(current_reg + 1) + ';');
+
+            shader.AddLine(
+                "gl_FragDepth = " +
+                regs.GetRegisterAsFloat(static_cast<Tegra::Shader::Register>(current_reg) + 1) +
+                ';');
         }
     }
 
@@ -1038,6 +1131,15 @@ private:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
                 // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
+                ASSERT_MSG(instr.fmul.tab5cb8_2 == 0, "FMUL tab5cb8_2({}) is not implemented",
+                           instr.fmul.tab5cb8_2.Value());
+                ASSERT_MSG(instr.fmul.tab5c68_1 == 0, "FMUL tab5c68_1({}) is not implemented",
+                           instr.fmul.tab5c68_1.Value());
+                // SMO typically sends 1 here, which seems to be the default.
+                ASSERT_MSG(instr.fmul.tab5c68_0 == 1, "FMUL tab5c68_0({}) is not implemented",
+                           instr.fmul.tab5c68_0.Value());
+                ASSERT_MSG(instr.fmul.cc == 0, "FMUL cc is not implemented");
+
                 op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
                                         instr.alu.saturate_d);
@@ -1357,7 +1459,7 @@ private:
                 if (instr.alu_integer.negate_b)
                     op_b = "-(" + op_b + ')';
 
-                std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
+                const std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
 
                 regs.SetRegisterToInteger(instr.gpr0, true, 0,
                                           "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
@@ -1375,7 +1477,7 @@ private:
             case OpCode::Id::SEL_C:
             case OpCode::Id::SEL_R:
             case OpCode::Id::SEL_IMM: {
-                std::string condition =
+                const std::string condition =
                     GetPredicateCondition(instr.sel.pred, instr.sel.neg_pred != 0);
                 regs.SetRegisterToInteger(instr.gpr0, true, 0,
                                           '(' + condition + ") ? " + op_a + " : " + op_b, 1, 1);
@@ -1397,8 +1499,9 @@ private:
             case OpCode::Id::LOP3_C:
             case OpCode::Id::LOP3_R:
             case OpCode::Id::LOP3_IMM: {
-                std::string op_c = regs.GetRegisterAsInteger(instr.gpr39);
+                const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39);
                 std::string lut;
+
                 if (opcode->GetId() == OpCode::Id::LOP3_R) {
                     lut = '(' + std::to_string(instr.alu.lop3.GetImmLut28()) + ')';
                 } else {
@@ -1413,15 +1516,80 @@ private:
             case OpCode::Id::IMNMX_IMM: {
                 ASSERT_MSG(instr.imnmx.exchange == Tegra::Shader::IMinMaxExchange::None,
                            "Unimplemented");
-                std::string condition =
+                const std::string condition =
                     GetPredicateCondition(instr.imnmx.pred, instr.imnmx.negate_pred != 0);
-                std::string parameters = op_a + ',' + op_b;
+                const std::string parameters = op_a + ',' + op_b;
                 regs.SetRegisterToInteger(instr.gpr0, instr.imnmx.is_signed, 0,
                                           '(' + condition + ") ? min(" + parameters + ") : max(" +
                                               parameters + ')',
                                           1, 1);
                 break;
             }
+            case OpCode::Id::LEA_R2:
+            case OpCode::Id::LEA_R1:
+            case OpCode::Id::LEA_IMM:
+            case OpCode::Id::LEA_RZ:
+            case OpCode::Id::LEA_HI: {
+                std::string op_c;
+
+                switch (opcode->GetId()) {
+                case OpCode::Id::LEA_R2: {
+                    op_a = regs.GetRegisterAsInteger(instr.gpr20);
+                    op_b = regs.GetRegisterAsInteger(instr.gpr39);
+                    op_c = std::to_string(instr.lea.r2.entry_a);
+                    break;
+                }
+
+                case OpCode::Id::LEA_R1: {
+                    const bool neg = instr.lea.r1.neg != 0;
+                    op_a = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_a = "-(" + op_a + ')';
+                    op_b = regs.GetRegisterAsInteger(instr.gpr20);
+                    op_c = std::to_string(instr.lea.r1.entry_a);
+                    break;
+                }
+
+                case OpCode::Id::LEA_IMM: {
+                    const bool neg = instr.lea.imm.neg != 0;
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_b = "-(" + op_b + ')';
+                    op_a = std::to_string(instr.lea.imm.entry_a);
+                    op_c = std::to_string(instr.lea.imm.entry_b);
+                    break;
+                }
+
+                case OpCode::Id::LEA_RZ: {
+                    const bool neg = instr.lea.rz.neg != 0;
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_b = "-(" + op_b + ')';
+                    op_a = regs.GetUniform(instr.lea.rz.cb_index, instr.lea.rz.cb_offset,
+                                           GLSLRegister::Type::Integer);
+                    op_c = std::to_string(instr.lea.rz.entry_a);
+
+                    break;
+                }
+
+                case OpCode::Id::LEA_HI:
+                default: {
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    op_a = std::to_string(instr.lea.imm.entry_a);
+                    op_c = std::to_string(instr.lea.imm.entry_b);
+                    LOG_CRITICAL(HW_GPU, "Unhandled LEA subinstruction: {}", opcode->GetName());
+                    UNREACHABLE();
+                }
+                }
+                if (instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex)) {
+                    LOG_ERROR(HW_GPU, "Unhandled LEA Predicate");
+                    UNREACHABLE();
+                }
+                const std::string value = '(' + op_a + " + (" + op_b + "*(1 << " + op_c + ")))";
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, value, 1, 1);
+
+                break;
+            }
             default: {
                 LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
                              opcode->GetName());
@@ -1432,10 +1600,16 @@ private:
             break;
         }
         case OpCode::Type::Ffma: {
-            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
+            const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
             std::string op_b = instr.ffma.negate_b ? "-" : "";
             std::string op_c = instr.ffma.negate_c ? "-" : "";
 
+            ASSERT_MSG(instr.ffma.cc == 0, "FFMA cc not implemented");
+            ASSERT_MSG(instr.ffma.tab5980_0 == 1, "FFMA tab5980_0({}) not implemented",
+                       instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
+            ASSERT_MSG(instr.ffma.tab5980_1 == 0, "FFMA tab5980_1({}) not implemented",
+                       instr.ffma.tab5980_1.Value());
+
             switch (opcode->GetId()) {
             case OpCode::Id::FFMA_CR: {
                 op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
@@ -1486,7 +1660,8 @@ private:
                 }
 
                 regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
-                                          1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
+                                          1, instr.alu.saturate_d, 0, instr.conversion.dest_size,
+                                          instr.generates_cc.Value() != 0);
                 break;
             }
             case OpCode::Id::I2F_R:
@@ -1616,9 +1791,34 @@ private:
         case OpCode::Type::Memory: {
             switch (opcode->GetId()) {
             case OpCode::Id::LD_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element,
-                                                instr.attribute.fmt20.index);
+                // NOTE: Shouldn't this use flat interpolation mode (i.e. no interpolation)?
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute loads are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute loads are not supported");
+
+                Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Perspective,
+                                                  Tegra::Shader::IpaSampleMode::Default};
+
+                u64 next_element = instr.attribute.fmt20.element;
+                u64 next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());
+
+                const auto LoadNextElement = [&](u32 reg_offset) {
+                    regs.SetRegisterToInputAttibute(instr.gpr0.Value() + reg_offset, next_element,
+                                                    static_cast<Attribute::Index>(next_index),
+                                                    input_mode);
+
+                    // Advance to the attribute element that the following register will load.
+                    // Once the element index wraps past the end of the vec4, continue with the
+                    // first element of the next attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    LoadNextElement(reg_offset);
+                }
                 break;
             }
             case OpCode::Id::LD_C: {
@@ -1632,7 +1832,7 @@ private:
                 shader.AddLine("uint index = (" + regs.GetRegisterAsInteger(instr.gpr8, 0, false) +
                                " / 4) & (MAX_CONSTBUFFER_ELEMENTS - 1);");
 
-                std::string op_a =
+                const std::string op_a =
                     regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, "index",
                                             GLSLRegister::Type::Float);
 
@@ -1642,7 +1842,7 @@ private:
                     break;
 
                 case Tegra::Shader::UniformType::Double: {
-                    std::string op_b =
+                    const std::string op_b =
                         regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4,
                                                 "index", GLSLRegister::Type::Float);
                     regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
@@ -1660,25 +1860,111 @@ private:
                 break;
             }
             case OpCode::Id::ST_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index,
-                                                  instr.attribute.fmt20.element, instr.gpr0);
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute stores are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute stores are not supported");
+
+                u64 next_element = instr.attribute.fmt20.element;
+                u64 next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());
+
+                const auto StoreNextElement = [&](u32 reg_offset) {
+                    regs.SetOutputAttributeToRegister(static_cast<Attribute::Index>(next_index),
+                                                      next_element,
+                                                      instr.gpr0.Value() + reg_offset);
+
+                    // Advance to the attribute element that the following register is stored
+                    // to. Once the element index wraps past the end of the vec4, continue with
+                    // the first element of the next attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    StoreNextElement(reg_offset);
+                }
+
                 break;
             }
             case OpCode::Id::TEX: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                ASSERT_MSG(instr.tex.array == 0, "TEX arrays unimplemented");
+                Tegra::Shader::TextureType texture_type{instr.tex.texture_type};
+                std::string coord;
+
+                ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
+                           "AOFFI is not implemented");
+                ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
+                           "DC is not implemented");
+
+                switch (texture_type) {
+                case Tegra::Shader::TextureType::Texture1D: {
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    coord = "float coords = " + x + ';';
+                    break;
+                }
+                case Tegra::Shader::TextureType::Texture2D: {
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(texture_type));
+                    UNREACHABLE();
+
+                    // Fallback to interpreting as a 2D texture for now
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    texture_type = Tegra::Shader::TextureType::Texture2D;
+                }
+                // TODO: make sure coordinates are always indexed to gpr8 and gpr20 is always bias
+                // or lod.
+                const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20);
+
+                const std::string sampler = GetSampler(instr.sampler, texture_type, false);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
+
                 shader.AddLine("{");
                 ++shader.scope;
                 shader.AddLine(coord);
-                const std::string texture = "texture(" + sampler + ", coords)";
+                std::string texture;
 
-                size_t dest_elem{};
-                for (size_t elem = 0; elem < 4; ++elem) {
+                switch (instr.tex.process_mode) {
+                case Tegra::Shader::TextureProcessMode::None: {
+                    texture = "texture(" + sampler + ", coords)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LZ: {
+                    texture = "textureLod(" + sampler + ", coords, 0.0)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LB:
+                case Tegra::Shader::TextureProcessMode::LBA: {
+                    // TODO: Figure out whether the A suffix changes the equation at all.
+                    texture = "texture(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LL:
+                case Tegra::Shader::TextureProcessMode::LLA: {
+                    // TODO: Figure out whether the A suffix changes the equation at all.
+                    texture = "textureLod(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                default: {
+                    texture = "texture(" + sampler + ", coords)";
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}",
+                                 static_cast<u32>(instr.tex.process_mode.Value()));
+                    UNREACHABLE();
+                }
+                }
+                std::size_t dest_elem{};
+                for (std::size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -1691,20 +1977,77 @@ private:
                 break;
             }
             case OpCode::Id::TEXS: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                std::string coord;
+                Tegra::Shader::TextureType texture_type{instr.texs.GetTextureType()};
+                bool is_array{instr.texs.IsArrayTexture()};
+
+                ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
+                           "DC is not implemented");
+
+                switch (texture_type) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (is_array) {
+                        const std::string index = regs.GetRegisterAsInteger(instr.gpr8);
+                        const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                        const std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");";
+                    } else {
+                        const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                        const std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(texture_type));
+                    UNREACHABLE();
 
+                    // Fallback to interpreting as a 2D texture for now
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    texture_type = Tegra::Shader::TextureType::Texture2D;
+                    is_array = false;
+                }
+                const std::string sampler = GetSampler(instr.sampler, texture_type, is_array);
                 const std::string texture = "texture(" + sampler + ", coords)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
             case OpCode::Id::TLDS: {
-                const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsInteger(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "ivec2 coords = ivec2(" + op_a + ", " + op_b + ");";
+                ASSERT(instr.tlds.GetTextureType() == Tegra::Shader::TextureType::Texture2D);
+                ASSERT(instr.tlds.IsArrayTexture() == false);
+                std::string coord;
+
+                ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
+                           "AOFFI is not implemented");
+                ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::MZ),
+                           "MZ is not implemented");
+
+                switch (instr.tlds.GetTextureType()) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (instr.tlds.IsArrayTexture()) {
+                        LOG_CRITICAL(HW_GPU, "Unhandled 2d array texture");
+                        UNREACHABLE();
+                    } else {
+                        const std::string x = regs.GetRegisterAsInteger(instr.gpr8);
+                        const std::string y = regs.GetRegisterAsInteger(instr.gpr20);
+                        coord = "ivec2 coords = ivec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(instr.tlds.GetTextureType()));
+                    UNREACHABLE();
+                }
+                const std::string sampler = GetSampler(instr.sampler, instr.tlds.GetTextureType(),
+                                                       instr.tlds.IsArrayTexture());
                 const std::string texture = "texelFetch(" + sampler + ", coords, 0)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
@@ -1712,12 +2055,23 @@ private:
             case OpCode::Id::TLD4: {
                 ASSERT(instr.tld4.texture_type == Tegra::Shader::TextureType::Texture2D);
                 ASSERT(instr.tld4.array == 0);
-                std::string coord{};
+                std::string coord;
+
+                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
+                           "AOFFI is not implemented");
+                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
+                           "DC is not implemented");
+                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV),
+                           "NDV is not implemented");
+                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::PTP),
+                           "PTP is not implemented");
 
                 switch (instr.tld4.texture_type) {
                 case Tegra::Shader::TextureType::Texture2D: {
-                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
-                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                     coord = "vec2 coords = vec2(" + x + ", " + y + ");";
                     break;
                 }
@@ -1727,7 +2081,8 @@ private:
                     UNREACHABLE();
                 }
 
-                const std::string sampler = GetSampler(instr.sampler);
+                const std::string sampler =
+                    GetSampler(instr.sampler, instr.tld4.texture_type, false);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
@@ -1736,8 +2091,8 @@ private:
                 const std::string texture = "textureGather(" + sampler + ", coords, " +
                                             std::to_string(instr.tld4.component) + ')';
 
-                size_t dest_elem{};
-                for (size_t elem = 0; elem < 4; ++elem) {
+                std::size_t dest_elem{};
+                for (std::size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -1750,16 +2105,100 @@ private:
                 break;
             }
             case OpCode::Id::TLD4S: {
+                ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
+                           "AOFFI is not implemented");
+                ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
+                           "DC is not implemented");
+
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction.
-                const std::string sampler = GetSampler(instr.sampler);
+                const std::string sampler =
+                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
                 const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
                 const std::string texture = "textureGather(" + sampler + ", coords, " +
                                             std::to_string(instr.tld4s.component) + ')';
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
+            case OpCode::Id::TXQ: {
+                ASSERT_MSG(!instr.txq.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+
+                // TODO: The new texture refactor commits change the way samplers work. Sadly,
+                // not all texture instructions specify the type of texture their sampler uses.
+                // This must be fixed at a later point.
+                const std::string sampler =
+                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
+                switch (instr.txq.query_type) {
+                case Tegra::Shader::TextureQueryType::Dimension: {
+                    const std::string texture = "textureQueryLevels(" + sampler + ')';
+                    regs.SetRegisterToInteger(instr.gpr0, true, 0, texture, 1, 1);
+                    break;
+                }
+                default: {
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture query type: {}",
+                                 static_cast<u32>(instr.txq.query_type.Value()));
+                    UNREACHABLE();
+                }
+                }
+                break;
+            }
+            case OpCode::Id::TMML: {
+                ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
+                           "NODEP is not implemented");
+                ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV),
+                           "NDV is not implemented");
+
+                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
+                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                const bool is_array = instr.tmml.array != 0;
+                auto texture_type = instr.tmml.texture_type.Value();
+                const std::string sampler = GetSampler(instr.sampler, texture_type, is_array);
+
+                // TODO: add coordinates for different samplers once other texture types are
+                // implemented.
+                std::string coord;
+                switch (texture_type) {
+                case Tegra::Shader::TextureType::Texture1D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    coord = "float coords = " + x + ';';
+                    break;
+                }
+                case Tegra::Shader::TextureType::Texture2D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(texture_type));
+                    UNREACHABLE();
+
+                    // Fallback to interpreting as a 2D texture for now
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    texture_type = Tegra::Shader::TextureType::Texture2D;
+                }
+                // Add an extra scope and declare the texture coords inside to prevent
+                // overwriting them in case they are used as outputs of the tmml instruction.
+                shader.AddLine('{');
+                ++shader.scope;
+                shader.AddLine(coord);
+                const std::string texture = "textureQueryLod(" + sampler + ", coords)";
+                const std::string tmp = "vec2 tmp = " + texture + "*vec2(256.0, 256.0);";
+                shader.AddLine(tmp);
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(tmp.y)", 1, 1);
+                regs.SetRegisterToInteger(instr.gpr0.Value() + 1, false, 0, "uint(tmp.x)", 1, 1);
+                --shader.scope;
+                shader.AddLine('}');
+                break;
+            }
             default: {
                 LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
                 UNREACHABLE();
@@ -1799,12 +2238,12 @@ private:
             // We can't use the constant predicate as destination.
             ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
 
-            std::string second_pred =
+            const std::string second_pred =
                 GetPredicateCondition(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
 
-            std::string combiner = GetPredicateCombiner(instr.fsetp.op);
+            const std::string combiner = GetPredicateCombiner(instr.fsetp.op);
 
-            std::string predicate = GetPredicateComparison(instr.fsetp.cond, op_a, op_b);
+            const std::string predicate = GetPredicateComparison(instr.fsetp.cond, op_a, op_b);
             // Set the primary predicate to the result of Predicate OP SecondPredicate
             SetPredicate(instr.fsetp.pred3,
                          '(' + predicate + ") " + combiner + " (" + second_pred + ')');
@@ -1818,7 +2257,8 @@ private:
             break;
         }
         case OpCode::Type::IntegerSetPredicate: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed);
+            const std::string op_a =
+                regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed);
             std::string op_b;
 
             if (instr.is_b_imm) {
@@ -1835,12 +2275,12 @@ private:
             // We can't use the constant predicate as destination.
             ASSERT(instr.isetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
 
-            std::string second_pred =
+            const std::string second_pred =
                 GetPredicateCondition(instr.isetp.pred39, instr.isetp.neg_pred != 0);
 
-            std::string combiner = GetPredicateCombiner(instr.isetp.op);
+            const std::string combiner = GetPredicateCombiner(instr.isetp.op);
 
-            std::string predicate = GetPredicateComparison(instr.isetp.cond, op_a, op_b);
+            const std::string predicate = GetPredicateComparison(instr.isetp.cond, op_a, op_b);
             // Set the primary predicate to the result of Predicate OP SecondPredicate
             SetPredicate(instr.isetp.pred3,
                          '(' + predicate + ") " + combiner + " (" + second_pred + ')');
@@ -1853,32 +2293,80 @@ private:
             }
             break;
         }
+        case OpCode::Type::PredicateSetRegister: {
+            const std::string op_a =
+                GetPredicateCondition(instr.pset.pred12, instr.pset.neg_pred12 != 0);
+            const std::string op_b =
+                GetPredicateCondition(instr.pset.pred29, instr.pset.neg_pred29 != 0);
+
+            const std::string second_pred =
+                GetPredicateCondition(instr.pset.pred39, instr.pset.neg_pred39 != 0);
+
+            const std::string combiner = GetPredicateCombiner(instr.pset.op);
+
+            const std::string predicate =
+                '(' + op_a + ") " + GetPredicateCombiner(instr.pset.cond) + " (" + op_b + ')';
+            const std::string result = '(' + predicate + ") " + combiner + " (" + second_pred + ')';
+            if (instr.pset.bf == 0) {
+                const std::string value = '(' + result + ") ? 0xFFFFFFFF : 0";
+                regs.SetRegisterToInteger(instr.gpr0, false, 0, value, 1, 1);
+            } else {
+                const std::string value = '(' + result + ") ? 1.0 : 0.0";
+                regs.SetRegisterToFloat(instr.gpr0, 0, value, 1, 1);
+            }
+
+            break;
+        }
         case OpCode::Type::PredicateSetPredicate: {
-            std::string op_a =
-                GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0);
-            std::string op_b =
-                GetPredicateCondition(instr.psetp.pred29, instr.psetp.neg_pred29 != 0);
+            switch (opcode->GetId()) {
+            case OpCode::Id::PSETP: {
+                const std::string op_a =
+                    GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0);
+                const std::string op_b =
+                    GetPredicateCondition(instr.psetp.pred29, instr.psetp.neg_pred29 != 0);
 
-            // We can't use the constant predicate as destination.
-            ASSERT(instr.psetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
+                // We can't use the constant predicate as destination.
+                ASSERT(instr.psetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
 
-            std::string second_pred =
-                GetPredicateCondition(instr.psetp.pred39, instr.psetp.neg_pred39 != 0);
+                const std::string second_pred =
+                    GetPredicateCondition(instr.psetp.pred39, instr.psetp.neg_pred39 != 0);
 
-            std::string combiner = GetPredicateCombiner(instr.psetp.op);
+                const std::string combiner = GetPredicateCombiner(instr.psetp.op);
 
-            std::string predicate =
-                '(' + op_a + ") " + GetPredicateCombiner(instr.psetp.cond) + " (" + op_b + ')';
+                const std::string predicate =
+                    '(' + op_a + ") " + GetPredicateCombiner(instr.psetp.cond) + " (" + op_b + ')';
 
-            // Set the primary predicate to the result of Predicate OP SecondPredicate
-            SetPredicate(instr.psetp.pred3,
-                         '(' + predicate + ") " + combiner + " (" + second_pred + ')');
+                // Set the primary predicate to the result of Predicate OP SecondPredicate
+                SetPredicate(instr.psetp.pred3,
+                             '(' + predicate + ") " + combiner + " (" + second_pred + ')');
 
-            if (instr.psetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
-                // Set the secondary predicate to the result of !Predicate OP SecondPredicate,
-                // if enabled
-                SetPredicate(instr.psetp.pred0,
-                             "!(" + predicate + ") " + combiner + " (" + second_pred + ')');
+                if (instr.psetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
+                    // Set the secondary predicate to the result of !Predicate OP SecondPredicate,
+                    // if enabled
+                    SetPredicate(instr.psetp.pred0,
+                                 "!(" + predicate + ") " + combiner + " (" + second_pred + ')');
+                }
+                break;
+            }
+            case OpCode::Id::CSETP: {
+                const std::string pred =
+                    GetPredicateCondition(instr.csetp.pred39, instr.csetp.neg_pred39 != 0);
+                const std::string combiner = GetPredicateCombiner(instr.csetp.op);
+                const std::string controlCode = regs.GetControlCode(instr.csetp.cc);
+                if (instr.csetp.pred3 != static_cast<u64>(Pred::UnusedIndex)) {
+                    SetPredicate(instr.csetp.pred3,
+                                 '(' + controlCode + ") " + combiner + " (" + pred + ')');
+                }
+                if (instr.csetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
+                    SetPredicate(instr.csetp.pred0,
+                                 "!(" + controlCode + ") " + combiner + " (" + pred + ')');
+                }
+                break;
+            }
+            default: {
+                LOG_CRITICAL(HW_GPU, "Unhandled predicate instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
             }
             break;
         }
@@ -1893,7 +2381,7 @@ private:
             std::string op_b = instr.fset.neg_b ? "-" : "";
 
             if (instr.is_b_imm) {
-                std::string imm = GetImmediate19(instr);
+                const std::string imm = GetImmediate19(instr);
                 if (instr.fset.neg_imm)
                     op_b += "(-" + imm + ')';
                 else
@@ -1913,13 +2401,14 @@ private:
 
             // The fset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
             // condition is true, and to 0 otherwise.
-            std::string second_pred =
+            const std::string second_pred =
                 GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0);
 
-            std::string combiner = GetPredicateCombiner(instr.fset.op);
+            const std::string combiner = GetPredicateCombiner(instr.fset.op);
 
-            std::string predicate = "((" + GetPredicateComparison(instr.fset.cond, op_a, op_b) +
-                                    ") " + combiner + " (" + second_pred + "))";
+            const std::string predicate = "((" +
+                                          GetPredicateComparison(instr.fset.cond, op_a, op_b) +
+                                          ") " + combiner + " (" + second_pred + "))";
 
             if (instr.fset.bf) {
                 regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
@@ -1930,7 +2419,7 @@ private:
             break;
         }
         case OpCode::Type::IntegerSet: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed);
+            const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed);
 
             std::string op_b;
 
@@ -1947,13 +2436,14 @@ private:
 
             // The iset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
             // condition is true, and to 0 otherwise.
-            std::string second_pred =
+            const std::string second_pred =
                 GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0);
 
-            std::string combiner = GetPredicateCombiner(instr.iset.op);
+            const std::string combiner = GetPredicateCombiner(instr.iset.op);
 
-            std::string predicate = "((" + GetPredicateComparison(instr.iset.cond, op_a, op_b) +
-                                    ") " + combiner + " (" + second_pred + "))";
+            const std::string predicate = "((" +
+                                          GetPredicateComparison(instr.iset.cond, op_a, op_b) +
+                                          ") " + combiner + " (" + second_pred + "))";
 
             if (instr.iset.bf) {
                 regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
@@ -2103,45 +2593,22 @@ private:
             case OpCode::Id::BRA: {
                 ASSERT_MSG(instr.bra.constant_buffer == 0,
                            "BRA with constant buffers are not implemented");
-                u32 target = offset + instr.bra.GetBranchTarget();
+                const u32 target = offset + instr.bra.GetBranchTarget();
                 shader.AddLine("{ jmp_to = " + std::to_string(target) + "u; break; }");
                 break;
             }
             case OpCode::Id::IPA: {
                 const auto& attribute = instr.attribute.fmt28;
                 const auto& reg = instr.gpr0;
-                switch (instr.ipa.mode) {
-                case Tegra::Shader::IpaMode::Pass:
-                    if (stage == Maxwell3D::Regs::ShaderStage::Fragment &&
-                        attribute.index == Attribute::Index::Position) {
-                        switch (attribute.element) {
-                        case 0:
-                            shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.x;");
-                            break;
-                        case 1:
-                            shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.y;");
-                            break;
-                        case 2:
-                            shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.z;");
-                            break;
-                        case 3:
-                            shader.AddLine(regs.GetRegisterAsFloat(reg) + " = 1.0;");
-                            break;
-                        }
-                    } else {
-                        regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index);
-                    }
-                    break;
-                case Tegra::Shader::IpaMode::None:
-                    regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index);
-                    break;
-                default:
-                    LOG_CRITICAL(HW_GPU, "Unhandled IPA mode: {}",
-                                 static_cast<u32>(instr.ipa.mode.Value()));
-                    UNREACHABLE();
-                    regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index);
-                }
 
+                Tegra::Shader::IpaMode input_mode{instr.ipa.interp_mode.Value(),
+                                                  instr.ipa.sample_mode.Value()};
+                regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index,
+                                                input_mode);
+
+                if (instr.ipa.saturate) {
+                    regs.SetRegisterToFloat(reg, 0, regs.GetRegisterAsFloat(reg), 1, 1, true);
+                }
                 break;
             }
             case OpCode::Id::SSY: {
@@ -2150,7 +2617,7 @@ private:
                 // has a similar structure to the BRA opcode.
                 ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported");
 
-                u32 target = offset + instr.bra.GetBranchTarget();
+                const u32 target = offset + instr.bra.GetBranchTarget();
                 EmitPushToSSYStack(target);
                 break;
             }
@@ -2244,10 +2711,10 @@ private:
                     shader.AddLine("case " + std::to_string(label) + "u: {");
                     ++shader.scope;
 
-                    auto next_it = labels.lower_bound(label + 1);
-                    u32 next_label = next_it == labels.end() ? subroutine.end : *next_it;
+                    const auto next_it = labels.lower_bound(label + 1);
+                    const u32 next_label = next_it == labels.end() ? subroutine.end : *next_it;
 
-                    u32 compile_end = CompileRange(label, next_label);
+                    const u32 compile_end = CompileRange(label, next_label);
                     if (compile_end > next_label && compile_end != PROGRAM_END) {
                         // This happens only when there is a label inside a IF/LOOP block
                         shader.AddLine(" jmp_to = " + std::to_string(compile_end) + "u; break; }");
@@ -2289,6 +2756,7 @@ private:
 private:
     const std::set<Subroutine>& subroutines;
     const ProgramCode& program_code;
+    Tegra::Shader::Header header;
     const u32 main_offset;
     Maxwell3D::Regs::ShaderStage stage;
     const std::string& suffix;
@@ -2310,7 +2778,8 @@ boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code,
                                                 Maxwell3D::Regs::ShaderStage stage,
                                                 const std::string& suffix) {
     try {
-        auto subroutines = ControlFlowAnalyzer(program_code, main_offset, suffix).GetSubroutines();
+        const auto subroutines =
+            ControlFlowAnalyzer(program_code, main_offset, suffix).GetSubroutines();
         GLSLGenerator generator(subroutines, program_code, main_offset, stage, suffix);
         return ProgramResult{generator.GetShaderCode(), generator.GetEntries()};
     } catch (const DecompileFail& exception) {
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 6ca05945e..b0466c18f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -42,6 +42,7 @@ layout (std140) uniform vs_config {
 };
 
 void main() {
+    position = vec4(0.0, 0.0, 0.0, 0.0);
     exec_vertex();
 )";
 
@@ -87,7 +88,14 @@ ProgramResult GenerateFragmentShader(const ShaderSetup& setup) {
             .get_value_or({});
     out += R"(
 in vec4 position;
-layout(location = 0) out vec4 color[8];
+layout(location = 0) out vec4 FragColor0;
+layout(location = 1) out vec4 FragColor1;
+layout(location = 2) out vec4 FragColor2;
+layout(location = 3) out vec4 FragColor3;
+layout(location = 4) out vec4 FragColor4;
+layout(location = 5) out vec4 FragColor5;
+layout(location = 6) out vec4 FragColor6;
+layout(location = 7) out vec4 FragColor7;
 
 layout (std140) uniform fs_config {
     vec4 viewport_flip;
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index c788099d4..d53b93ad5 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -9,10 +9,11 @@
 #include <vector>
 
 #include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
 
 namespace OpenGL::GLShader {
 
-constexpr size_t MAX_PROGRAM_CODE_LENGTH{0x1000};
+constexpr std::size_t MAX_PROGRAM_CODE_LENGTH{0x1000};
 using ProgramCode = std::vector<u64>;
 
 class ConstBufferEntry {
@@ -50,7 +51,11 @@ public:
     }
 
     std::string GetName() const {
-        return BufferBaseNames[static_cast<size_t>(stage)] + std::to_string(index);
+        return BufferBaseNames[static_cast<std::size_t>(stage)] + std::to_string(index);
+    }
+
+    u32 GetHash() const { // packs stage (bits 16 and up) and index into one u32
+        return (static_cast<u32>(stage) << 16) | index; // index >= 0x10000 would overlap the stage
     }
 
 private:
@@ -69,14 +74,15 @@ class SamplerEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index)
-        : offset(offset), stage(stage), sampler_index(index) {}
+    SamplerEntry(Maxwell::ShaderStage stage, std::size_t offset, std::size_t index,
+                 Tegra::Shader::TextureType type, bool is_array)
+        : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array) {}
 
-    size_t GetOffset() const {
+    std::size_t GetOffset() const {
         return offset;
     }
 
-    size_t GetIndex() const {
+    std::size_t GetIndex() const {
         return sampler_index;
     }
 
@@ -85,23 +91,63 @@ public:
     }
 
     std::string GetName() const {
-        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' +
-               std::to_string(sampler_index) + ']';
+        return std::string(TextureSamplerNames[static_cast<std::size_t>(stage)]) + '_' +
+               std::to_string(sampler_index);
+    }
+
+    std::string GetTypeString() const { // GLSL sampler type name for `type` / `is_array`
+        using Tegra::Shader::TextureType;
+        std::string glsl_type; // stays empty unless one of the cases below assigns it
+
+        switch (type) {
+        case TextureType::Texture1D:
+            glsl_type = "sampler1D";
+            break;
+        case TextureType::Texture2D:
+            glsl_type = "sampler2D";
+            break;
+        case TextureType::Texture3D:
+            glsl_type = "sampler3D";
+            break;
+        case TextureType::TextureCube:
+            glsl_type = "samplerCube";
+            break;
+        default:
+            UNIMPLEMENTED(); // no type name is assigned for any other texture type
+        }
+        if (is_array)
+            glsl_type += "Array"; // e.g. "sampler2D" becomes "sampler2DArray"
+        return glsl_type;
+    }
+
+    Tegra::Shader::TextureType GetType() const {
+        return type;
+    }
+
+    bool IsArray() const {
+        return is_array;
+    }
+
+    u32 GetHash() const { // packs stage (bits 16 and up) with sampler_index truncated to u32
+        return (static_cast<u32>(stage) << 16) | static_cast<u32>(sampler_index);
     }
 
     static std::string GetArrayName(Maxwell::ShaderStage stage) {
-        return TextureSamplerNames[static_cast<size_t>(stage)];
+        return TextureSamplerNames[static_cast<std::size_t>(stage)];
     }
 
 private:
     static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = {
         "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs",
     };
+
     /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
     /// instruction.
-    size_t offset;
-    Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used.
-    size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
+    std::size_t offset;
+    Maxwell::ShaderStage stage;      ///< Shader stage where this sampler was used.
+    std::size_t sampler_index;       ///< Suffix GetName() appends to the stage's sampler name.
+    Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc)
+    bool is_array; ///< Whether the texture is being sampled as an array texture or not.
 };
 
 struct ShaderEntries {
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 533e42caa..b86cd96e8 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -12,7 +12,7 @@
 namespace OpenGL::GLShader {
 
 /// Number of OpenGL texture samplers that can be used in the fragment shader
-static constexpr size_t NumTextureSamplers = 32;
+static constexpr std::size_t NumTextureSamplers = 32;
 
 using Tegra::Engines::Maxwell3D;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index 5781d9d16..5f3fe067e 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -25,7 +25,7 @@ GLuint LoadShader(const char* source, GLenum type) {
     default:
         UNREACHABLE();
     }
-    GLuint shader_id = glCreateShader(type);
+    const GLuint shader_id = glCreateShader(type);
     glShaderSource(shader_id, 1, &source, nullptr);
     LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
     glCompileShader(shader_id);
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 60a4defd1..af99132ba 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -200,9 +200,9 @@ void OpenGLState::Apply() const {
         const auto& texture_unit = texture_units[i];
         const auto& cur_state_texture_unit = cur_state.texture_units[i];
 
-        if (texture_unit.texture_2d != cur_state_texture_unit.texture_2d) {
+        if (texture_unit.texture != cur_state_texture_unit.texture) {
             glActiveTexture(TextureUnits::MaxwellTexture(static_cast<int>(i)).Enum());
-            glBindTexture(GL_TEXTURE_2D, texture_unit.texture_2d);
+            glBindTexture(texture_unit.target, texture_unit.texture);
         }
         if (texture_unit.sampler != cur_state_texture_unit.sampler) {
             glBindSampler(static_cast<GLuint>(i), texture_unit.sampler);
@@ -214,7 +214,7 @@ void OpenGLState::Apply() const {
             texture_unit.swizzle.a != cur_state_texture_unit.swizzle.a) {
             std::array<GLint, 4> mask = {texture_unit.swizzle.r, texture_unit.swizzle.g,
                                          texture_unit.swizzle.b, texture_unit.swizzle.a};
-            glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, mask.data());
+            glTexParameteriv(texture_unit.target, GL_TEXTURE_SWIZZLE_RGBA, mask.data());
         }
     }
 
@@ -272,7 +272,7 @@ void OpenGLState::Apply() const {
     }
 
     // Clip distance
-    for (size_t i = 0; i < clip_distance.size(); ++i) {
+    for (std::size_t i = 0; i < clip_distance.size(); ++i) {
         if (clip_distance[i] != cur_state.clip_distance[i]) {
             if (clip_distance[i]) {
                 glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
@@ -287,7 +287,7 @@ void OpenGLState::Apply() const {
 
 OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
     for (auto& unit : texture_units) {
-        if (unit.texture_2d == handle) {
+        if (unit.texture == handle) {
             unit.Unbind();
         }
     }
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 46e96a97d..e3e24b9e7 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -94,8 +94,9 @@ public:
 
     // 3 texture units - one for each that is used in PICA fragment shader emulation
     struct TextureUnit {
-        GLuint texture_2d; // GL_TEXTURE_BINDING_2D
-        GLuint sampler;    // GL_SAMPLER_BINDING
+        GLuint texture; // Texture handle that gets bound to `target`
+        GLuint sampler; // GL_SAMPLER_BINDING
+        GLenum target; // Texture target the handle is bound to; Reset() sets GL_TEXTURE_2D
         struct {
             GLint r; // GL_TEXTURE_SWIZZLE_R
             GLint g; // GL_TEXTURE_SWIZZLE_G
@@ -104,7 +105,7 @@ public:
         } swizzle;
 
         void Unbind() {
-            texture_2d = 0;
+            texture = 0;
             swizzle.r = GL_RED;
             swizzle.g = GL_GREEN;
             swizzle.b = GL_BLUE;
@@ -114,6 +115,7 @@ public:
         void Reset() {
             Unbind();
             sampler = 0;
+            target = GL_TEXTURE_2D;
         }
     };
     std::array<TextureUnit, 32> texture_units;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index e565afcee..664f3ca20 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -29,7 +29,7 @@ OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coh
     if (GLAD_GL_ARB_buffer_storage) {
         persistent = true;
         coherent = prefer_coherent;
-        GLbitfield flags =
+        const GLbitfield flags =
             GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
         glBufferStorage(gl_target, allocate_size, nullptr, flags);
         mapped_ptr = static_cast<u8*>(glMapBufferRange(
@@ -61,7 +61,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
     mapped_size = size;
 
     if (alignment > 0) {
-        buffer_pos = Common::AlignUp<size_t>(buffer_pos, alignment);
+        buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
     }
 
     bool invalidate = false;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 411a73d50..96d916b07 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -177,7 +177,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
                                        Memory::GetPointer(framebuffer_addr),
                                        gl_framebuffer_data.data(), true);
 
-        state.texture_units[0].texture_2d = screen_info.texture.resource.handle;
+        state.texture_units[0].texture = screen_info.texture.resource.handle;
         state.Apply();
 
         glActiveTexture(GL_TEXTURE0);
@@ -194,7 +194,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
 
         glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 
-        state.texture_units[0].texture_2d = 0;
+        state.texture_units[0].texture = 0;
         state.Apply();
     }
 }
@@ -205,7 +205,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
  */
 void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
                                                 const TextureInfo& texture) {
-    state.texture_units[0].texture_2d = texture.resource.handle;
+    state.texture_units[0].texture = texture.resource.handle;
     state.Apply();
 
     glActiveTexture(GL_TEXTURE0);
@@ -214,7 +214,7 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
     // Update existing texture
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 1, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, framebuffer_data);
 
-    state.texture_units[0].texture_2d = 0;
+    state.texture_units[0].texture = 0;
     state.Apply();
 }
 
@@ -260,7 +260,7 @@ void RendererOpenGL::InitOpenGLObjects() {
     // Allocation of storage is deferred until the first frame, when we
     // know the framebuffer size.
 
-    state.texture_units[0].texture_2d = screen_info.texture.resource.handle;
+    state.texture_units[0].texture = screen_info.texture.resource.handle;
     state.Apply();
 
     glActiveTexture(GL_TEXTURE0);
@@ -272,7 +272,7 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     screen_info.display_texture = screen_info.texture.resource.handle;
 
-    state.texture_units[0].texture_2d = 0;
+    state.texture_units[0].texture = 0;
     state.Apply();
 
     // Clear screen to black
@@ -305,14 +305,14 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
         UNREACHABLE();
     }
 
-    state.texture_units[0].texture_2d = texture.resource.handle;
+    state.texture_units[0].texture = texture.resource.handle;
     state.Apply();
 
     glActiveTexture(GL_TEXTURE0);
     glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0,
                  texture.gl_format, texture.gl_type, nullptr);
 
-    state.texture_units[0].texture_2d = 0;
+    state.texture_units[0].texture = 0;
     state.Apply();
 }
 
@@ -354,14 +354,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
         ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v),
     }};
 
-    state.texture_units[0].texture_2d = screen_info.display_texture;
+    state.texture_units[0].texture = screen_info.display_texture;
     state.texture_units[0].swizzle = {GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA};
     state.Apply();
 
     glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices.data());
     glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
 
-    state.texture_units[0].texture_2d = 0;
+    state.texture_units[0].texture = 0;
     state.Apply();
 }
 
@@ -369,6 +369,12 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
  * Draws the emulated screens to the emulator window.
  */
 void RendererOpenGL::DrawScreen() {
+    if (renderer_settings.set_background_color) {
+        // Update background color before drawing
+        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
+                     0.0f);
+    }
+
     const auto& layout = render_window.GetFramebufferLayout();
     const auto& screen = layout.screen;
 
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 272294c62..20ba6d4f6 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -46,6 +46,48 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_
     }
 }
 
+template <std::size_t N, std::size_t M>
+struct alignas(64) SwizzleTable { // N x M table of u16 offsets filled by the constexpr ctor
+    constexpr SwizzleTable() {
+        for (u32 y = 0; y < N; ++y) {
+            for (u32 x = 0; x < M; ++x) {
+                const u32 x2 = x * 16; // x counts 16-byte steps; x2 is the matching byte offset
+                values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
+                                                ((x2 % 32) / 16) * 32 + (y % 2) * 16);
+            }
+        }
+    }
+    const std::array<u16, M>& operator[](std::size_t index) const { // row `index`, unchecked
+        return values[index];
+    }
+    std::array<std::array<u16, M>, N> values{}; // values[y][x]; zeroed before the ctor fills it
+};
+
+constexpr auto swizzle_table = SwizzleTable<8, 4>();
+
+void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_data,
+                     u8* unswizzled_data, bool unswizzle, u32 block_height) {
+    std::array<u8*, 2> data_ptrs; // [0] is the memcpy destination, [1] the memcpy source
+    const std::size_t stride{width * bytes_per_pixel}; // bytes in one linear row
+    const std::size_t image_width_in_gobs{(stride + 63) / 64}; // 64-byte units, rounded up
+    const std::size_t copy_size{16}; // bytes moved per memcpy; rows are walked in these steps
+    for (std::size_t y = 0; y < height; ++y) {
+        const std::size_t initial_gob = // swizzled-side base offset used for row y
+            (y / (8 * block_height)) * 512 * block_height * image_width_in_gobs +
+            (y % (8 * block_height) / 8) * 512;
+        const std::size_t pixel_base{y * width * bytes_per_pixel}; // linear offset of row y
+        const auto& table = swizzle_table[y % 8]; // precomputed offsets for this row
+        for (std::size_t xb = 0; xb < stride; xb += copy_size) { // xb: byte offset in the row
+            const std::size_t gob_address{initial_gob + (xb / 64) * 512 * block_height};
+            const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]};
+            const std::size_t pixel_index{xb + pixel_base};
+            data_ptrs[unswizzle] = swizzled_data + swizzle_offset; // source when unswizzling
+            data_ptrs[!unswizzle] = unswizzled_data + pixel_index; // destination when unswizzling
+            std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); // always a full 16-byte copy
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
     switch (format) {
     case TextureFormat::DXT1:
@@ -63,6 +105,7 @@ u32 BytesPerPixel(TextureFormat format) {
     case TextureFormat::R32_G32_B32:
         return 12;
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::BF10GF11RF11:
@@ -91,8 +134,13 @@ u32 BytesPerPixel(TextureFormat format) {
 std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width,
                                  u32 height, u32 block_height) {
     std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
-    CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel,
-                     Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    if (bytes_per_pixel % 3 != 0 && ((width / tile_size) * bytes_per_pixel) % 16 == 0) {
+        FastSwizzleData(width / tile_size, height / tile_size, bytes_per_pixel,
+                        Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    } else { // bytes_per_pixel is a multiple of 3, or the row is not whole 16-byte chunks
+        CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel,
+                         Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    }
     return unswizzled_data;
 }
 
@@ -111,6 +159,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     case TextureFormat::BC6H_UF16:
     case TextureFormat::BC6H_SF16:
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::A1B5G5R5:
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index c6bd2f4b9..c2fb824b2 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -170,8 +170,12 @@ struct TICEntry {
         BitField<0, 16, u32> width_minus_1;
         BitField<23, 4, TextureType> texture_type;
     };
-    u16 height_minus_1;
-    INSERT_PADDING_BYTES(10);
+    union {
+        BitField<0, 16, u32> height_minus_1;
+        BitField<16, 15, u32> depth_minus_1;
+    };
+
+    INSERT_PADDING_BYTES(8);
 
     GPUVAddr Address() const {
         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
@@ -192,6 +196,10 @@ struct TICEntry {
         return height_minus_1 + 1;
     }
 
+    u32 Depth() const {
+        return depth_minus_1 + 1;
+    }
+
     u32 BlockHeight() const {
         ASSERT(header_version == TICHeaderVersion::BlockLinear ||
                header_version == TICHeaderVersion::BlockLinearColorKey);