27 files changed, 2885 insertions, 380 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 281810357..c6431e722 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(video_core STATIC
     engines/maxwell_3d.h
     engines/maxwell_compute.cpp
     engines/maxwell_compute.h
+    engines/maxwell_dma.cpp
+    engines/maxwell_dma.h
     engines/shader_bytecode.h
     gpu.cpp
     gpu.h
@@ -39,6 +41,8 @@ add_library(video_core STATIC
     renderer_opengl/maxwell_to_gl.h
     renderer_opengl/renderer_opengl.cpp
     renderer_opengl/renderer_opengl.h
+    textures/astc.cpp
+    textures/astc.h
     textures/decoders.cpp
     textures/decoders.h
     textures/texture.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index d72d6f760..cec9cb9f3 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -16,6 +16,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
@@ -60,8 +61,11 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
     case EngineID::MAXWELL_COMPUTE_B:
         maxwell_compute->WriteReg(method, value);
         break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        maxwell_dma->WriteReg(method, value);
+        break;
     default:
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("Unimplemented engine");
     }
 }
 
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 6b9382f06..998b7c843 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -47,6 +47,7 @@ void Fermi2D::HandleSurfaceCopy() {
 
     if (regs.src.linear == regs.dst.linear) {
         // If the input layout and the output layout are the same, just perform a raw copy.
+        ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
         Memory::CopyBlock(dest_cpu, source_cpu,
                           src_bytes_per_pixel * regs.dst.width * regs.dst.height);
         return;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index ef12d9300..93c43c8cb 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -328,8 +328,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
 
         Texture::FullTextureInfo tex_info{};
         // TODO(Subv): Use the shader to determine which textures are actually accessed.
-        tex_info.index = (current_texture - tex_info_buffer.address - TextureInfoOffset) /
-                         sizeof(Texture::TextureHandle);
+        tex_info.index =
+            static_cast<u32>(current_texture - tex_info_buffer.address - TextureInfoOffset) /
+            sizeof(Texture::TextureHandle);
 
         // Load the TIC data.
         if (tex_handle.tic_id != 0) {
@@ -354,6 +355,40 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
     return textures;
 }
 
+Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, size_t offset) const {
+    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
+    auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
+    ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
+
+    GPUVAddr tex_info_address = tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
+
+    ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
+
+    boost::optional<VAddr> tex_address_cpu = memory_manager.GpuToCpuAddress(tex_info_address);
+    Texture::TextureHandle tex_handle{Memory::Read32(*tex_address_cpu)};
+
+    Texture::FullTextureInfo tex_info{};
+    tex_info.index = static_cast<u32>(offset);
+
+    // Load the TIC data.
+    if (tex_handle.tic_id != 0) {
+        tex_info.enabled = true;
+
+        auto tic_entry = GetTICEntry(tex_handle.tic_id);
+        // TODO(Subv): Workaround for BitField's move constructor being deleted.
+        std::memcpy(&tex_info.tic, &tic_entry, sizeof(tic_entry));
+    }
+
+    // Load the TSC data
+    if (tex_handle.tsc_id != 0) {
+        auto tsc_entry = GetTSCEntry(tex_handle.tsc_id);
+        // TODO(Subv): Workaround for BitField's move constructor being deleted.
+        std::memcpy(&tex_info.tsc, &tsc_entry, sizeof(tsc_entry));
+    }
+
+    return tex_info;
+}
+
 u32 Maxwell3D::GetRegisterValue(u32 method) const {
     ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
     return regs.reg_array[method];
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 245410c95..2dc251205 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -318,6 +318,7 @@ public:
             Equation equation_a;
             Factor factor_source_a;
             Factor factor_dest_a;
+            INSERT_PADDING_WORDS(1);
         };
 
         union {
@@ -432,7 +433,27 @@ public:
                     };
                 } rt_control;
 
-                INSERT_PADDING_WORDS(0xCF);
+                INSERT_PADDING_WORDS(0x31);
+
+                u32 independent_blend_enable;
+
+                INSERT_PADDING_WORDS(0x15);
+
+                struct {
+                    u32 separate_alpha;
+                    Blend::Equation equation_rgb;
+                    Blend::Factor factor_source_rgb;
+                    Blend::Factor factor_dest_rgb;
+                    Blend::Equation equation_a;
+                    Blend::Factor factor_source_a;
+                    INSERT_PADDING_WORDS(1);
+                    Blend::Factor factor_dest_a;
+
+                    u32 enable_common;
+                    u32 enable[NumRenderTargets];
+                } blend;
+
+                INSERT_PADDING_WORDS(0x77);
 
                 struct {
                     u32 tsc_address_high;
@@ -557,9 +578,7 @@ public:
 
                 } vertex_array[NumVertexArrays];
 
-                Blend blend;
-
-                INSERT_PADDING_WORDS(0x39);
+                Blend independent_blend[NumRenderTargets];
 
                 struct {
                     u32 limit_high;
@@ -664,6 +683,9 @@ public:
     /// Returns a list of enabled textures for the specified shader stage.
     std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
 
+    /// Returns the texture information for a specific texture in a specific shader stage.
+    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, size_t offset) const;
+
     /// Returns whether the specified shader stage is enabled or not.
     bool IsShaderStageEnabled(Regs::ShaderStage stage) const;
 
@@ -719,6 +741,8 @@ ASSERT_REG_POSITION(vertex_buffer, 0x35D);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
+ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
+ASSERT_REG_POSITION(blend, 0x4CF);
 ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(tic, 0x55D);
 ASSERT_REG_POSITION(code_address, 0x582);
@@ -726,7 +750,7 @@ ASSERT_REG_POSITION(draw, 0x585);
 ASSERT_REG_POSITION(index_array, 0x5F2);
 ASSERT_REG_POSITION(query, 0x6C0);
 ASSERT_REG_POSITION(vertex_array[0], 0x700);
-ASSERT_REG_POSITION(blend, 0x780);
+ASSERT_REG_POSITION(independent_blend, 0x780);
 ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0);
 ASSERT_REG_POSITION(shader_config[0], 0x800);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
new file mode 100644
index 000000000..442138988
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,69 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/memory.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+namespace Engines {
+
+MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+
+void MaxwellDMA::WriteReg(u32 method, u32 value) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid MaxwellDMA register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = value;
+
+#define MAXWELLDMA_REG_INDEX(field_name)                                                           \
+    (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32))
+
+    switch (method) {
+    case MAXWELLDMA_REG_INDEX(exec): {
+        HandleCopy();
+        break;
+    }
+    }
+
+#undef MAXWELLDMA_REG_INDEX
+}
+
+void MaxwellDMA::HandleCopy() {
+    NGLOG_WARNING(HW_GPU, "Requested a DMA copy");
+
+    const GPUVAddr source = regs.src_address.Address();
+    const GPUVAddr dest = regs.dst_address.Address();
+
+    const VAddr source_cpu = *memory_manager.GpuToCpuAddress(source);
+    const VAddr dest_cpu = *memory_manager.GpuToCpuAddress(dest);
+
+    // TODO(Subv): Perform more research and implement all features of this engine.
+    ASSERT(regs.exec.enable_swizzle == 0);
+    ASSERT(regs.exec.enable_2d == 1);
+    ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
+    ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
+    ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
+    ASSERT(regs.src_params.pos_x == 0);
+    ASSERT(regs.src_params.pos_y == 0);
+    ASSERT(regs.dst_params.pos_x == 0);
+    ASSERT(regs.dst_params.pos_y == 0);
+    ASSERT(regs.exec.is_dst_linear != regs.exec.is_src_linear);
+
+    u8* src_buffer = Memory::GetPointer(source_cpu);
+    u8* dst_buffer = Memory::GetPointer(dest_cpu);
+
+    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, 1, 1, src_buffer,
+                                  dst_buffer, true, regs.src_params.BlockHeight());
+    } else {
+        // If the input is linear and the output is tiled, swizzle the input and copy it over.
+        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
+                                  src_buffer, false, regs.dst_params.BlockHeight());
+    }
+}
+
+} // namespace Engines
+} // namespace Tegra
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
new file mode 100644
index 000000000..905749bde
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,155 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+namespace Engines {
+
+class MaxwellDMA final {
+public:
+    explicit MaxwellDMA(MemoryManager& memory_manager);
+    ~MaxwellDMA() = default;
+
+    /// Write the value to the register identified by method.
+    void WriteReg(u32 method, u32 value);
+
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x1D6;
+
+        struct Parameters {
+            union {
+                BitField<0, 4, u32> block_depth;
+                BitField<4, 4, u32> block_height;
+                BitField<8, 4, u32> block_width;
+            };
+            u32 size_x;
+            u32 size_y;
+            u32 size_z;
+            u32 pos_z;
+            union {
+                BitField<0, 16, u32> pos_x;
+                BitField<16, 16, u32> pos_y;
+            };
+
+            u32 BlockHeight() const {
+                return 1 << block_height;
+            }
+        };
+
+        static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
+
+        enum class CopyMode : u32 {
+            None = 0,
+            Unk1 = 1,
+            Unk2 = 2,
+        };
+
+        enum class QueryMode : u32 {
+            None = 0,
+            Short = 1,
+            Long = 2,
+        };
+
+        enum class QueryIntr : u32 {
+            None = 0,
+            Block = 1,
+            NonBlock = 2,
+        };
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0xC0);
+
+                struct {
+                    union {
+                        BitField<0, 2, CopyMode> copy_mode;
+                        BitField<2, 1, u32> flush;
+
+                        BitField<3, 2, QueryMode> query_mode;
+                        BitField<5, 2, QueryIntr> query_intr;
+
+                        BitField<7, 1, u32> is_src_linear;
+                        BitField<8, 1, u32> is_dst_linear;
+
+                        BitField<9, 1, u32> enable_2d;
+                        BitField<10, 1, u32> enable_swizzle;
+                    };
+                } exec;
+
+                INSERT_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } src_address;
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } dst_address;
+
+                u32 src_pitch;
+                u32 dst_pitch;
+                u32 x_count;
+                u32 y_count;
+
+                INSERT_PADDING_WORDS(0xBB);
+
+                Parameters dst_params;
+
+                INSERT_PADDING_WORDS(1);
+
+                Parameters src_params;
+
+                INSERT_PADDING_WORDS(0x13);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    MemoryManager& memory_manager;
+
+private:
+    /// Performs the copy from the source buffer to the destination buffer as configured in the
+    /// registers.
+    void HandleCopy();
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(exec, 0xC0);
+ASSERT_REG_POSITION(src_address, 0x100);
+ASSERT_REG_POSITION(dst_address, 0x102);
+ASSERT_REG_POSITION(src_pitch, 0x104);
+ASSERT_REG_POSITION(dst_pitch, 0x105);
+ASSERT_REG_POSITION(x_count, 0x106);
+ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(dst_params, 0x1C3);
+ASSERT_REG_POSITION(src_params, 0x1CA);
+
+#undef ASSERT_REG_POSITION
+
+} // namespace Engines
+} // namespace Tegra
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 38757c038..cb4db0679 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -109,11 +109,6 @@ union Sampler {
     u64 value{};
 };
 
-union Uniform {
-    BitField<20, 14, u64> offset;
-    BitField<34, 5, u64> index;
-};
-
 } // namespace Shader
 } // namespace Tegra
 
@@ -173,6 +168,31 @@ enum class SubOp : u64 {
     Min = 0x8,
 };
 
+enum class F2iRoundingOp : u64 {
+    None = 0,
+    Floor = 1,
+    Ceil = 2,
+    Trunc = 3,
+};
+
+enum class F2fRoundingOp : u64 {
+    None = 0,
+    Pass = 3,
+    Round = 8,
+    Floor = 9,
+    Ceil = 10,
+    Trunc = 11,
+};
+
+enum class UniformType : u64 {
+    UnsignedByte = 0,
+    SignedByte = 1,
+    UnsignedShort = 2,
+    SignedShort = 3,
+    Single = 4,
+    Double = 5,
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -196,12 +216,12 @@ union Instruction {
 
     union {
         BitField<20, 19, u64> imm20_19;
-        BitField<20, 32, u64> imm20_32;
+        BitField<20, 32, s64> imm20_32;
         BitField<45, 1, u64> negate_b;
         BitField<46, 1, u64> abs_a;
         BitField<48, 1, u64> negate_a;
         BitField<49, 1, u64> abs_b;
-        BitField<50, 1, u64> abs_d;
+        BitField<50, 1, u64> saturate_d;
         BitField<56, 1, u64> negate_imm;
 
         union {
@@ -210,10 +230,18 @@ union Instruction {
         } fmnmx;
 
         union {
+            BitField<39, 1, u64> invert_a;
+            BitField<40, 1, u64> invert_b;
+            BitField<41, 2, LogicOperation> operation;
+            BitField<44, 2, u64> unk44;
+            BitField<48, 3, Pred> pred48;
+        } lop;
+
+        union {
             BitField<53, 2, LogicOperation> operation;
             BitField<55, 1, u64> invert_a;
             BitField<56, 1, u64> invert_b;
-        } lop;
+        } lop32i;
 
         float GetImm20_19() const {
             float result{};
@@ -226,7 +254,7 @@ union Instruction {
 
         float GetImm20_32() const {
             float result{};
-            u32 imm{static_cast<u32>(imm20_32)};
+            s32 imm{static_cast<s32>(imm20_32)};
             std::memcpy(&result, &imm, sizeof(imm));
             return result;
         }
@@ -240,10 +268,30 @@ union Instruction {
     } alu;
 
     union {
+        BitField<48, 1, u64> is_signed;
+    } shift;
+
+    union {
         BitField<39, 5, u64> shift_amount;
         BitField<48, 1, u64> negate_b;
         BitField<49, 1, u64> negate_a;
-    } iscadd;
+    } alu_integer;
+
+    union {
+        BitField<54, 1, u64> saturate;
+        BitField<56, 1, u64> negate_a;
+    } iadd32i;
+
+    union {
+        BitField<20, 8, u64> shift_position;
+        BitField<28, 8, u64> shift_length;
+        BitField<48, 1, u64> negate_b;
+        BitField<49, 1, u64> negate_a;
+
+        u64 GetLeftShiftValue() const {
+            return 32 - (shift_position + shift_length);
+        }
+    } bfe;
 
     union {
         BitField<48, 1, u64> negate_b;
@@ -251,6 +299,11 @@ union Instruction {
     } ffma;
 
     union {
+        BitField<48, 3, UniformType> type;
+        BitField<44, 2, u64> unknown;
+    } ld_c;
+
+    union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
         BitField<7, 1, u64> abs_a;
@@ -289,19 +342,37 @@ union Instruction {
     } fset;
 
     union {
-        BitField<10, 2, Register::Size> size;
-        BitField<13, 1, u64> is_signed;
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred;
+        BitField<44, 1, u64> bf;
+        BitField<45, 2, PredOperation> op;
+        BitField<48, 1, u64> is_signed;
+        BitField<49, 3, PredCondition> cond;
+    } iset;
+
+    union {
+        BitField<8, 2, Register::Size> dest_size;
+        BitField<10, 2, Register::Size> src_size;
+        BitField<12, 1, u64> is_output_signed;
+        BitField<13, 1, u64> is_input_signed;
         BitField<41, 2, u64> selector;
         BitField<45, 1, u64> negate_a;
         BitField<49, 1, u64> abs_a;
-        BitField<50, 1, u64> saturate_a;
+
+        union {
+            BitField<39, 2, F2iRoundingOp> rounding;
+        } f2i;
+
+        union {
+            BitField<39, 4, F2fRoundingOp> rounding;
+        } f2f;
     } conversion;
 
     union {
         BitField<31, 4, u64> component_mask;
 
         bool IsComponentEnabled(size_t component) const {
-            return ((1 << component) & component_mask) != 0;
+            return ((1ull << component) & component_mask) != 0;
         }
     } tex;
 
@@ -320,7 +391,7 @@ union Instruction {
 
             ASSERT(component_mask_selector < mask.size());
 
-            return ((1 << component) & mask[component_mask_selector]) != 0;
+            return ((1ull << component) & mask[component_mask_selector]) != 0;
         }
     } texs;
 
@@ -338,12 +409,21 @@ union Instruction {
         }
     } bra;
 
+    union {
+        BitField<20, 14, u64> offset;
+        BitField<34, 5, u64> index;
+    } cbuf34;
+
+    union {
+        BitField<20, 16, s64> offset;
+        BitField<36, 5, u64> index;
+    } cbuf36;
+
     BitField<61, 1, u64> is_b_imm;
     BitField<60, 1, u64> is_b_gpr;
     BitField<59, 1, u64> is_c_gpr;
 
     Attribute attribute;
-    Uniform uniform;
     Sampler sampler;
 
     u64 value;
@@ -356,8 +436,13 @@ class OpCode {
 public:
     enum class Id {
         KIL,
+        SSY,
+        BFE_C,
+        BFE_R,
+        BFE_IMM,
         BRA,
         LD_A,
+        LD_C,
         ST_A,
         TEX,
         TEXQ, // Texture Query
@@ -376,6 +461,10 @@ public:
         FMUL_R,
         FMUL_IMM,
         FMUL32_IMM,
+        IADD_C,
+        IADD_R,
+        IADD_IMM,
+        IADD32I,
         ISCADD_C, // Scale and Add
         ISCADD_R,
         ISCADD_IMM,
@@ -395,6 +484,9 @@ public:
         I2I_C,
         I2I_R,
         I2I_IMM,
+        LOP_C,
+        LOP_R,
+        LOP_IMM,
         LOP32I,
         MOV_C,
         MOV_R,
@@ -409,6 +501,9 @@ public:
         FMNMX_C,
         FMNMX_R,
         FMNMX_IMM,
+        IMNMX_C,
+        IMNMX_R,
+        IMNMX_IMM,
         FSETP_C, // Set Predicate
         FSETP_R,
         FSETP_IMM,
@@ -418,20 +513,30 @@ public:
         ISETP_C,
         ISETP_IMM,
         ISETP_R,
+        ISET_R,
+        ISET_C,
+        ISET_IMM,
         PSETP,
+        XMAD_IMM,
+        XMAD_CR,
+        XMAD_RC,
+        XMAD_RR,
     };
 
     enum class Type {
         Trivial,
         Arithmetic,
-        Logic,
+        ArithmeticImmediate,
+        ArithmeticInteger,
+        ArithmeticIntegerImmediate,
+        Bfe,
         Shift,
-        ScaledAdd,
         Ffma,
         Flow,
         Memory,
         FloatSet,
         FloatSetPredicate,
+        IntegerSet,
         IntegerSetPredicate,
         PredicateSetPredicate,
         Conversion,
@@ -530,8 +635,10 @@ private:
         std::vector<Matcher> table = {
 #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name)
             INST("111000110011----", Id::KIL, Type::Flow, "KIL"),
+            INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
             INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
+            INST("1110111110010---", Id::LD_C, Type::Memory, "LD_C"),
             INST("1110111111110---", Id::ST_A, Type::Memory, "ST_A"),
             INST("1100000000111---", Id::TEX, Type::Memory, "TEX"),
             INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"),
@@ -549,10 +656,14 @@ private:
             INST("0100110001101---", Id::FMUL_C, Type::Arithmetic, "FMUL_C"),
             INST("0101110001101---", Id::FMUL_R, Type::Arithmetic, "FMUL_R"),
             INST("0011100-01101---", Id::FMUL_IMM, Type::Arithmetic, "FMUL_IMM"),
-            INST("00011110--------", Id::FMUL32_IMM, Type::Arithmetic, "FMUL32_IMM"),
-            INST("0100110000011---", Id::ISCADD_C, Type::ScaledAdd, "ISCADD_C"),
-            INST("0101110000011---", Id::ISCADD_R, Type::ScaledAdd, "ISCADD_R"),
-            INST("0011100-00011---", Id::ISCADD_IMM, Type::ScaledAdd, "ISCADD_IMM"),
+            INST("00011110--------", Id::FMUL32_IMM, Type::ArithmeticImmediate, "FMUL32_IMM"),
+            INST("0100110000010---", Id::IADD_C, Type::ArithmeticInteger, "IADD_C"),
+            INST("0101110000010---", Id::IADD_R, Type::ArithmeticInteger, "IADD_R"),
+            INST("0011100-00010---", Id::IADD_IMM, Type::ArithmeticInteger, "IADD_IMM"),
+            INST("0001110---------", Id::IADD32I, Type::ArithmeticIntegerImmediate, "IADD32I"),
+            INST("0100110000011---", Id::ISCADD_C, Type::ArithmeticInteger, "ISCADD_C"),
+            INST("0101110000011---", Id::ISCADD_R, Type::ArithmeticInteger, "ISCADD_R"),
+            INST("0011100-00011---", Id::ISCADD_IMM, Type::ArithmeticInteger, "ISCADD_IMM"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
             INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
             INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),
@@ -560,17 +671,26 @@ private:
             INST("0100110010101---", Id::F2F_C, Type::Conversion, "F2F_C"),
             INST("0101110010101---", Id::F2F_R, Type::Conversion, "F2F_R"),
             INST("0011100-10101---", Id::F2F_IMM, Type::Conversion, "F2F_IMM"),
-            INST("0100110010110---", Id::F2I_C, Type::Arithmetic, "F2I_C"),
-            INST("0101110010110---", Id::F2I_R, Type::Arithmetic, "F2I_R"),
-            INST("0011100-10110---", Id::F2I_IMM, Type::Arithmetic, "F2I_IMM"),
+            INST("0100110010110---", Id::F2I_C, Type::Conversion, "F2I_C"),
+            INST("0101110010110---", Id::F2I_R, Type::Conversion, "F2I_R"),
+            INST("0011100-10110---", Id::F2I_IMM, Type::Conversion, "F2I_IMM"),
             INST("0100110010011---", Id::MOV_C, Type::Arithmetic, "MOV_C"),
             INST("0101110010011---", Id::MOV_R, Type::Arithmetic, "MOV_R"),
             INST("0011100-10011---", Id::MOV_IMM, Type::Arithmetic, "MOV_IMM"),
-            INST("000000010000----", Id::MOV32_IMM, Type::Arithmetic, "MOV32_IMM"),
+            INST("000000010000----", Id::MOV32_IMM, Type::ArithmeticImmediate, "MOV32_IMM"),
             INST("0100110001100---", Id::FMNMX_C, Type::Arithmetic, "FMNMX_C"),
             INST("0101110001100---", Id::FMNMX_R, Type::Arithmetic, "FMNMX_R"),
             INST("0011100-01100---", Id::FMNMX_IMM, Type::Arithmetic, "FMNMX_IMM"),
-            INST("000001----------", Id::LOP32I, Type::Logic, "LOP32I"),
+            INST("0100110000100---", Id::IMNMX_C, Type::Arithmetic, "IMNMX_C"),
+            INST("0101110000100---", Id::IMNMX_R, Type::Arithmetic, "IMNMX_R"),
+            INST("0011100-00100---", Id::IMNMX_IMM, Type::Arithmetic, "IMNMX_IMM"),
+            INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"),
+            INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"),
+            INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"),
+            INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"),
+            INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"),
+            INST("0011100-01000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
+            INST("000001----------", Id::LOP32I, Type::ArithmeticIntegerImmediate, "LOP32I"),
             INST("0100110001001---", Id::SHL_C, Type::Shift, "SHL_C"),
             INST("0101110001001---", Id::SHL_R, Type::Shift, "SHL_R"),
             INST("0011100-01001---", Id::SHL_IMM, Type::Shift, "SHL_IMM"),
@@ -592,7 +712,14 @@ private:
             INST("010010110110----", Id::ISETP_C, Type::IntegerSetPredicate, "ISETP_C"),
             INST("010110110110----", Id::ISETP_R, Type::IntegerSetPredicate, "ISETP_R"),
             INST("0011011-0110----", Id::ISETP_IMM, Type::IntegerSetPredicate, "ISETP_IMM"),
+            INST("010110110101----", Id::ISET_R, Type::IntegerSet, "ISET_R"),
+            INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
+            INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
             INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
+            INST("0011011-00------", Id::XMAD_IMM, Type::Arithmetic, "XMAD_IMM"),
+            INST("0100111---------", Id::XMAD_CR, Type::Arithmetic, "XMAD_CR"),
+            INST("010100010-------", Id::XMAD_RC, Type::Arithmetic, "XMAD_RC"),
+            INST("0101101100------", Id::XMAD_RR, Type::Arithmetic, "XMAD_RR"),
         };
 #undef INST
         std::stable_sort(table.begin(), table.end(), [](const auto& a, const auto& b) {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 756518ee7..e36483145 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -5,6 +5,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
 
 namespace Tegra {
@@ -14,6 +15,7 @@ GPU::GPU() {
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(*memory_manager);
     fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
     maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
 }
 
 GPU::~GPU() = default;
@@ -26,6 +28,10 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     ASSERT(format != RenderTargetFormat::NONE);
 
     switch (format) {
+    case RenderTargetFormat::RGBA32_FLOAT:
+        return 16;
+    case RenderTargetFormat::RGBA16_FLOAT:
+        return 8;
     case RenderTargetFormat::RGBA8_UNORM:
     case RenderTargetFormat::RGB10_A2_UNORM:
         return 4;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index f168a5171..7b4e9b842 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -15,10 +15,12 @@ namespace Tegra {
 
 enum class RenderTargetFormat : u32 {
     NONE = 0x0,
+    RGBA32_FLOAT = 0xC0,
     RGBA16_FLOAT = 0xCA,
     RGB10_A2_UNORM = 0xD1,
     RGBA8_UNORM = 0xD5,
     RGBA8_SRGB = 0xD6,
+    R11G11B10_FLOAT = 0xE0,
 };
 
 /// Returns the number of bytes per pixel of each rendertarget format.
@@ -61,6 +63,7 @@ namespace Engines {
 class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
+class MaxwellDMA;
 } // namespace Engines
 
 enum class EngineID {
@@ -101,6 +104,8 @@ private:
     std::unique_ptr<Engines::Fermi2D> fermi_2d;
     /// Compute engine
     std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
+    /// DMA engine
+    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 0a33868b7..3ba20f978 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -196,8 +196,10 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     auto& gpu = Core::System().GetInstance().GPU().Maxwell3D();
     ASSERT_MSG(!gpu.regs.shader_config[0].enable, "VertexA is unsupported!");
 
-    // Next available bindpoint to use when uploading the const buffers to the GLSL shaders.
+    // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
+    // shaders.
     u32 current_constbuffer_bindpoint = 0;
+    u32 current_texture_bindpoint = 0;
 
     for (unsigned index = 1; index < Maxwell::MaxShaderProgram; ++index) {
         auto& shader_config = gpu.regs.shader_config[index];
@@ -212,13 +214,17 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             continue;
         }
 
+        GLShader::MaxwellUniformData ubo{};
+        ubo.SetFromRegs(gpu.state.shader_stages[stage]);
+        std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
+
+        // Flush the buffer so that the GPU can see the data we just wrote.
+        glFlushMappedBufferRange(GL_ARRAY_BUFFER, buffer_offset, sizeof(ubo));
+
         // Upload uniform data as one UBO per stage
         const GLintptr ubo_offset = buffer_offset;
         copy_buffer(uniform_buffers[stage].handle, ubo_offset,
                     sizeof(GLShader::MaxwellUniformData));
-        GLShader::MaxwellUniformData* ub_ptr =
-            reinterpret_cast<GLShader::MaxwellUniformData*>(buffer_ptr);
-        ub_ptr->SetFromRegs(gpu.state.shader_stages[stage]);
 
         buffer_ptr += sizeof(GLShader::MaxwellUniformData);
         buffer_offset += sizeof(GLShader::MaxwellUniformData);
@@ -258,6 +264,11 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
         current_constbuffer_bindpoint =
             SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
                               current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
+
+        // Configure the textures for this shader stage.
+        current_texture_bindpoint =
+            SetupTextures(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
+                          current_texture_bindpoint, shader_resources.texture_samplers);
     }
 
     shader_program_manager->UseTrivialGeometryShader();
@@ -338,12 +349,12 @@ void RasterizerOpenGL::DrawArrays() {
     // Sync the viewport
     SyncViewport(surfaces_rect, res_scale);
 
+    // Sync the blend state registers
+    SyncBlendState();
+
     // TODO(bunnei): Sync framebuffer_scale uniform here
     // TODO(bunnei): Sync scissorbox uniform(s) here
 
-    // Sync and bind the texture surfaces
-    BindTextures();
-
     // Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable
     // scissor test to prevent drawing outside of the framebuffer region
     state.scissor.enabled = true;
@@ -447,65 +458,7 @@ void RasterizerOpenGL::DrawArrays() {
     }
 }
 
-void RasterizerOpenGL::BindTextures() {
-    using Regs = Tegra::Engines::Maxwell3D::Regs;
-    auto& maxwell3d = Core::System::GetInstance().GPU().Get3DEngine();
-
-    // Each Maxwell shader stage can have an arbitrary number of textures, but we're limited to a
-    // certain number in OpenGL. We try to only use the minimum amount of host textures by not
-    // keeping a 1:1 relation between guest texture ids and host texture ids, ie, guest texture id 8
-    // can be host texture id 0 if it's the only texture used in the guest shader program.
-    u32 host_texture_index = 0;
-    for (u32 stage = 0; stage < Regs::MaxShaderStage; ++stage) {
-        ASSERT(host_texture_index < texture_samplers.size());
-        const auto textures = maxwell3d.GetStageTextures(static_cast<Regs::ShaderStage>(stage));
-        for (unsigned texture_index = 0; texture_index < textures.size(); ++texture_index) {
-            const auto& texture = textures[texture_index];
-
-            if (texture.enabled) {
-                texture_samplers[host_texture_index].SyncWithConfig(texture.tsc);
-                Surface surface = res_cache.GetTextureSurface(texture);
-                if (surface != nullptr) {
-                    state.texture_units[host_texture_index].texture_2d = surface->texture.handle;
-                } else {
-                    // Can occur when texture addr is null or its memory is unmapped/invalid
-                    state.texture_units[texture_index].texture_2d = 0;
-                }
-
-                ++host_texture_index;
-            } else {
-                state.texture_units[texture_index].texture_2d = 0;
-            }
-        }
-    }
-}
-
-void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {
-    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
-    switch (method) {
-    case MAXWELL3D_REG_INDEX(blend.separate_alpha):
-        ASSERT_MSG(false, "unimplemented");
-        break;
-    case MAXWELL3D_REG_INDEX(blend.equation_rgb):
-        state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.blend.equation_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_source_rgb):
-        state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_dest_rgb):
-        state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.equation_a):
-        state.blend.a_equation = MaxwellToGL::BlendEquation(regs.blend.equation_a);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_source_a):
-        state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_a);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_dest_a):
-        state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_a);
-        break;
-    }
-}
+void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {} // No-op; see SyncBlendState()
 
 void RasterizerOpenGL::FlushAll() {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
@@ -654,7 +607,16 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
         buffer_draw_state.bindpoint = current_bindpoint + bindpoint;
 
         boost::optional<VAddr> addr = gpu.memory_manager->GpuToCpuAddress(buffer.address);
-        std::vector<u8> data(used_buffer.GetSize() * sizeof(float));
+
+        std::vector<u8> data;
+        if (used_buffer.IsIndirect()) {
+            // Buffer is accessed indirectly, so upload the entire thing
+            data.resize(buffer.size * sizeof(float));
+        } else {
+            // Buffer is accessed directly, upload just what we use
+            data.resize(used_buffer.GetSize() * sizeof(float));
+        }
+
         Memory::ReadBlock(*addr, data.data(), data.size());
 
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer_draw_state.ssbo);
@@ -671,7 +633,53 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
 
     state.Apply();
 
-    return current_bindpoint + entries.size();
+    return current_bindpoint + static_cast<u32>(entries.size());
+}
+
+u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
+                                    const std::vector<GLShader::SamplerEntry>& entries) {
+    auto& gpu = Core::System::GetInstance().GPU();
+    auto& maxwell3d = gpu.Get3DEngine();
+    // Gives each sampler entry its own texture unit, counting up from current_unit.
+    ASSERT_MSG(maxwell3d.IsShaderStageEnabled(stage),
+               "Attempted to upload textures of disabled shader stage");
+
+    ASSERT_MSG(current_unit + entries.size() <= std::size(state.texture_units),
+               "Exceeded the number of active textures.");
+
+    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
+        const auto& entry = entries[bindpoint];
+        u32 current_bindpoint = current_unit + bindpoint; // Texture unit used for this entry
+
+        // Point the sampler uniform named by the entry at that texture unit.
+        GLint uniform = glGetUniformLocation(program, entry.GetName().c_str());
+        ASSERT(uniform != -1); // -1: the program has no active uniform with this name
+        glProgramUniform1i(program, uniform, current_bindpoint);
+
+        const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset());
+        ASSERT(texture.enabled);
+        // Sync the sampler from the TSC, then bind the surface and its TIC component swizzle.
+        texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc);
+        Surface surface = res_cache.GetTextureSurface(texture);
+        if (surface != nullptr) {
+            state.texture_units[current_bindpoint].texture_2d = surface->texture.handle;
+            state.texture_units[current_bindpoint].swizzle.r =
+                MaxwellToGL::SwizzleSource(texture.tic.x_source);
+            state.texture_units[current_bindpoint].swizzle.g =
+                MaxwellToGL::SwizzleSource(texture.tic.y_source);
+            state.texture_units[current_bindpoint].swizzle.b =
+                MaxwellToGL::SwizzleSource(texture.tic.z_source);
+            state.texture_units[current_bindpoint].swizzle.a =
+                MaxwellToGL::SwizzleSource(texture.tic.w_source);
+        } else {
+            // Can occur when texture addr is null or its memory is unmapped/invalid
+            state.texture_units[current_bindpoint].texture_2d = 0;
+        }
+    }
+
+    state.Apply();
+
+    return current_unit + static_cast<u32>(entries.size()); // Next unit free for later stages
+}
 
 void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
@@ -730,14 +738,21 @@ void RasterizerOpenGL::SyncDepthOffset() {
     UNREACHABLE();
 }
 
-void RasterizerOpenGL::SyncBlendEnabled() {
-    UNREACHABLE();
-}
+void RasterizerOpenGL::SyncBlendState() { // Mirrors the guest blend registers into state.blend
+    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
-void RasterizerOpenGL::SyncBlendFuncs() {
-    UNREACHABLE();
-}
+    // TODO(Subv): Support more than just render target 0.
+    state.blend.enabled = regs.blend.enable[0] != 0;
 
-void RasterizerOpenGL::SyncBlendColor() {
-    UNREACHABLE();
+    if (!state.blend.enabled)
+        return; // Leave the cached equations and factors untouched while blending is off
+
+    ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
+    ASSERT_MSG(!regs.independent_blend[0].separate_alpha, "Unimplemented");
+    state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_rgb);
+    state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_rgb);
+    state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_rgb);
+    state.blend.a_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_a);
+    state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_a);
+    state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_a);
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 4b915c76a..b7c8cf843 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -80,9 +80,6 @@ private:
     void BindFramebufferSurfaces(const Surface& color_surface, const Surface& depth_surface,
                                  bool has_stencil);
 
-    /// Binds the required textures to OpenGL before drawing a batch.
-    void BindTextures();
-
     /*
      * Configures the current constbuffers to use for the draw command.
      * @param stage The shader stage to configure buffers for.
@@ -95,6 +92,17 @@ private:
                           u32 current_bindpoint,
                           const std::vector<GLShader::ConstBufferEntry>& entries);
 
+    /*
+     * Configures the current textures to use for the draw command.
+     * @param stage The shader stage to configure textures for.
+     * @param program The OpenGL program object that contains the specified stage.
+     * @param current_unit The first texture unit to use; entries occupy consecutive units from it.
+     * @param entries Vector describing the textures that are actually used in the guest shader.
+     * @returns The next free texture unit, to be passed as current_unit for the next shader stage.
+     */
+    u32 SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, GLuint program,
+                      u32 current_unit, const std::vector<GLShader::SamplerEntry>& entries);
+
     /// Syncs the viewport to match the guest state
     void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect, u16 res_scale);
 
@@ -113,14 +121,8 @@ private:
     /// Syncs the depth offset to match the guest state
     void SyncDepthOffset();
 
-    /// Syncs the blend enabled status to match the guest state
-    void SyncBlendEnabled();
-
-    /// Syncs the blend functions to match the guest state
-    void SyncBlendFuncs();
-
-    /// Syncs the blend color to match the guest state
-    void SyncBlendColor();
+    /// Syncs the blend state (enable flag, equations and factors) to match the guest state
+    void SyncBlendState();
 
     bool has_ARB_buffer_storage;
     bool has_ARB_direct_state_access;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index d6048f639..61d670dcb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -28,6 +28,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/textures/astc.h"
 #include "video_core/textures/decoders.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
@@ -50,18 +51,22 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, false},                // A1B5G5R5
     {GL_R8, GL_RED, GL_UNSIGNED_BYTE, false},                                   // R8
     {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false},                                // RGBA16F
+    {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false},        // R11FG11FB10F
     {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, true},   // DXT1
     {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT23
     {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT45
     {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, true},           // DXN1
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                               // ASTC_2D_4X4
 }};
 
 static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
     const SurfaceType type = SurfaceParams::GetFormatType(pixel_format);
     if (type == SurfaceType::ColorTexture) {
         ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
-        // For now only UNORM components are supported, or RGBA16F which is type FLOAT
-        ASSERT(component_type == ComponentType::UNorm || pixel_format == PixelFormat::RGBA16F);
+        // For now only UNORM components are supported, or either R11FG11FB10F or RGBA16F which are
+        // type FLOAT
+        ASSERT(component_type == ComponentType::UNorm || pixel_format == PixelFormat::RGBA16F ||
+               pixel_format == PixelFormat::R11FG11FB10F);
         return tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
         // TODO(Subv): Implement depth formats
@@ -83,6 +88,23 @@ static u16 GetResolutionScaleFactor() {
                                 : Settings::values.resolution_factor);
 }
 
+static void ConvertASTCToRGBA8(std::vector<u8>& data, PixelFormat format, u32 width, u32 height) {
+    u32 block_width{};
+    u32 block_height{};
+    // Pick the block dimensions that the format name encodes (4x4 for ASTC_2D_4X4).
+    switch (format) {
+    case PixelFormat::ASTC_2D_4X4:
+        block_width = 4;
+        block_height = 4;
+        break;
+    default:
+        NGLOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
+        UNREACHABLE();
+    }
+    // Replace the compressed bytes in data with the decoder's output (presumably RGBA8 - confirm).
+    data = Tegra::Texture::ASTC::Decompress(data, width, height, block_width, block_height);
+}
+
 template <bool morton_to_gl, PixelFormat format>
 void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr base,
                 Tegra::GPUVAddr start, Tegra::GPUVAddr end) {
@@ -94,6 +116,12 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::
         auto data = Tegra::Texture::UnswizzleTexture(
             *gpu.memory_manager->GpuToCpuAddress(base),
             SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
+
+        if (SurfaceParams::IsFormatASTC(format)) {
+            // ASTC formats are converted to RGBA8 in software, as most PC GPUs do not support this
+            ConvertASTCToRGBA8(data, format, stride, height);
+        }
+
         std::memcpy(gl_buffer, data.data(), data.size());
     } else {
         // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
@@ -110,11 +138,12 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
                                      Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
-        MortonCopy<true, PixelFormat::ABGR8>,       MortonCopy<true, PixelFormat::B5G6R5>,
-        MortonCopy<true, PixelFormat::A2B10G10R10>, MortonCopy<true, PixelFormat::A1B5G5R5>,
-        MortonCopy<true, PixelFormat::R8>,          MortonCopy<true, PixelFormat::RGBA16F>,
-        MortonCopy<true, PixelFormat::DXT1>,        MortonCopy<true, PixelFormat::DXT23>,
-        MortonCopy<true, PixelFormat::DXT45>,       MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::ABGR8>,        MortonCopy<true, PixelFormat::B5G6R5>,
+        MortonCopy<true, PixelFormat::A2B10G10R10>,  MortonCopy<true, PixelFormat::A1B5G5R5>,
+        MortonCopy<true, PixelFormat::R8>,           MortonCopy<true, PixelFormat::RGBA16F>,
+        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::DXT1>,
+        MortonCopy<true, PixelFormat::DXT23>,        MortonCopy<true, PixelFormat::DXT45>,
+        MortonCopy<true, PixelFormat::DXN1>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
 };
 
 static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra::GPUVAddr,
@@ -127,11 +156,13 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
         MortonCopy<false, PixelFormat::A1B5G5R5>,
         MortonCopy<false, PixelFormat::R8>,
         MortonCopy<false, PixelFormat::RGBA16F>,
+        MortonCopy<false, PixelFormat::R11FG11FB10F>,
         // TODO(Subv): Swizzling the DXT1/DXT23/DXT45/DXN1 formats is not yet supported
         nullptr,
         nullptr,
         nullptr,
         nullptr,
+        MortonCopy<false, PixelFormat::ABGR8>,
 };
 
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -164,60 +195,10 @@ static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tup
 static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex,
                          const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type,
                          GLuint read_fb_handle, GLuint draw_fb_handle) {
-    OpenGLState state = OpenGLState::GetCurState();
-
-    OpenGLState prev_state = state;
-    SCOPE_EXIT({ prev_state.Apply(); });
-
-    // Make sure textures aren't bound to texture units, since going to bind them to framebuffer
-    // components
-    state.ResetTexture(src_tex);
-    state.ResetTexture(dst_tex);
-
-    state.draw.read_framebuffer = read_fb_handle;
-    state.draw.draw_framebuffer = draw_fb_handle;
-    state.Apply();
-
-    u32 buffers = 0;
-
-    if (type == SurfaceType::ColorTexture) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src_tex,
-                               0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, dst_tex,
-                               0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-
-        buffers = GL_COLOR_BUFFER_BIT;
-    } else if (type == SurfaceType::Depth) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, src_tex, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, dst_tex, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-
-        buffers = GL_DEPTH_BUFFER_BIT;
-    } else if (type == SurfaceType::DepthStencil) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                               src_tex, 0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                               dst_tex, 0);
-
-        buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
-    }
-
-    glBlitFramebuffer(src_rect.left, src_rect.bottom, src_rect.right, src_rect.top, dst_rect.left,
-                      dst_rect.bottom, dst_rect.right, dst_rect.top, buffers,
-                      buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
 
+    glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0, dst_tex,
+                       GL_TEXTURE_2D, 0, dst_rect.left, dst_rect.bottom, 0, src_rect.GetWidth(),
+                       src_rect.GetHeight(), 1); // srcDepth 1: a 2D image is one slice deep
     return true;
 }
 
@@ -594,7 +575,7 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
         glCompressedTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format,
                                static_cast<GLsizei>(rect.GetWidth() * GetCompresssionFactor()),
                                static_cast<GLsizei>(rect.GetHeight() * GetCompresssionFactor()), 0,
-                               size, &gl_buffer[buffer_offset]);
+                               static_cast<GLsizei>(size), &gl_buffer[buffer_offset]);
     } else {
         glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
                         static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
@@ -933,9 +914,6 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, ScaleMatc
     // Use GetSurfaceSubRect instead
     ASSERT(params.width == params.stride);
 
-    ASSERT(!params.is_tiled ||
-           (params.GetActualWidth() % 8 == 0 && params.GetActualHeight() % 8 == 0));
-
     // Check for an exact match in existing surfaces
     Surface surface =
         FindMatch<MatchFlags::Exact | MatchFlags::Invalid>(surface_cache, params, match_res_scale);
@@ -1078,8 +1056,11 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
     params.addr = config.tic.Address();
     params.is_tiled = config.tic.IsTiled();
     params.pixel_format = SurfaceParams::PixelFormatFromTextureFormat(config.tic.format);
-    params.width = config.tic.Width() / params.GetCompresssionFactor();
-    params.height = config.tic.Height() / params.GetCompresssionFactor();
+
+    params.width = Common::AlignUp(config.tic.Width(), params.GetCompresssionFactor()) /
+                   params.GetCompresssionFactor();
+    params.height = Common::AlignUp(config.tic.Height(), params.GetCompresssionFactor()) /
+                    params.GetCompresssionFactor();
 
     // TODO(Subv): Different types per component are not supported.
     ASSERT(config.tic.r_type.Value() == config.tic.g_type.Value() &&
@@ -1090,6 +1071,13 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
 
     if (config.tic.IsTiled()) {
         params.block_height = config.tic.BlockHeight();
+
+        // TODO(bunnei): The below align up is a hack. This is here because some compressed textures
+        // are not a multiple of their own compression factor, and so this accounts for that. This
+        // could potentially result in an extra row of 4px being decoded if a texture is not a
+        // multiple of 4.
+        params.width = Common::AlignUp(params.width, 4);
+        params.height = Common::AlignUp(params.height, 4);
     } else {
         // Use the texture-provided stride value if the texture isn't tiled.
         params.stride = static_cast<u32>(params.PixelsInBytes(config.tic.Pitch()));
@@ -1097,23 +1085,6 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
 
     params.UpdateParams();
 
-    if (config.tic.Width() % 8 != 0 || config.tic.Height() % 8 != 0 ||
-        params.stride != params.width) {
-        Surface src_surface;
-        MathUtil::Rectangle<u32> rect;
-        std::tie(src_surface, rect) = GetSurfaceSubRect(params, ScaleMatch::Ignore, true);
-
-        params.res_scale = src_surface->res_scale;
-        Surface tmp_surface = CreateSurface(params);
-        BlitTextures(src_surface->texture.handle, rect, tmp_surface->texture.handle,
-                     tmp_surface->GetScaledRect(),
-                     SurfaceParams::GetFormatType(params.pixel_format), read_framebuffer.handle,
-                     draw_framebuffer.handle);
-
-        remove_surfaces.emplace(tmp_surface);
-        return tmp_surface;
-    }
-
     return GetSurface(params, ScaleMatch::Ignore, true);
 }
 
@@ -1288,7 +1259,7 @@ void RasterizerCacheOpenGL::ValidateSurface(const Surface& surface, Tegra::GPUVA
 
         const auto interval = *it & validate_interval;
         // Look for a valid surface to copy from
-        SurfaceParams params = surface->FromInterval(interval);
+        SurfaceParams params = *surface;
 
         Surface copy_surface =
             FindMatch<MatchFlags::Copy>(surface_cache, params, ScaleMatch::Ignore, interval);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 6f08678ab..9da945e19 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -60,10 +60,12 @@ struct SurfaceParams {
         A1B5G5R5 = 3,
         R8 = 4,
         RGBA16F = 5,
-        DXT1 = 6,
-        DXT23 = 7,
-        DXT45 = 8,
-        DXN1 = 9, // This is also known as BC4
+        R11FG11FB10F = 6,
+        DXT1 = 7,
+        DXT23 = 8,
+        DXT45 = 9,
+        DXN1 = 10, // This is also known as BC4
+        ASTC_2D_4X4 = 11,
 
         Max,
         Invalid = 255,
@@ -104,11 +106,13 @@ struct SurfaceParams {
             1, // A2B10G10R10
             1, // A1B5G5R5
             1, // R8
-            2, // RGBA16F
+            1, // RGBA16F
+            1, // R11FG11FB10F
             4, // DXT1
             4, // DXT23
             4, // DXT45
             4, // DXN1
+            1, // ASTC_2D_4X4
         }};
 
         ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
@@ -129,10 +133,12 @@ struct SurfaceParams {
             16,  // A1B5G5R5
             8,   // R8
             64,  // RGBA16F
+            32,  // R11FG11FB10F
             64,  // DXT1
             128, // DXT23
             128, // DXT45
             64,  // DXN1
+            32,  // ASTC_2D_4X4
         }};
 
         ASSERT(static_cast<size_t>(format) < bpp_table.size());
@@ -151,12 +157,23 @@ struct SurfaceParams {
             return PixelFormat::A2B10G10R10;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
             return PixelFormat::RGBA16F;
+        case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
+            return PixelFormat::R11FG11FB10F;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
         }
     }
 
+    static bool IsFormatASTC(PixelFormat format) { // True for ASTC-compressed pixel formats
+        switch (format) {
+        case PixelFormat::ASTC_2D_4X4: // The only ASTC entry handled here
+            return true;
+        default:
+            return false;
+        }
+    }
+
     static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
         switch (format) {
         case Tegra::FramebufferConfig::PixelFormat::ABGR8:
@@ -182,6 +199,8 @@ struct SurfaceParams {
             return PixelFormat::R8;
         case Tegra::Texture::TextureFormat::R16_G16_B16_A16:
             return PixelFormat::RGBA16F;
+        case Tegra::Texture::TextureFormat::BF10GF11RF11:
+            return PixelFormat::R11FG11FB10F;
         case Tegra::Texture::TextureFormat::DXT1:
             return PixelFormat::DXT1;
         case Tegra::Texture::TextureFormat::DXT23:
@@ -190,6 +209,8 @@ struct SurfaceParams {
             return PixelFormat::DXT45;
         case Tegra::Texture::TextureFormat::DXN1:
             return PixelFormat::DXN1;
+        case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
+            return PixelFormat::ASTC_2D_4X4;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
@@ -211,6 +232,8 @@ struct SurfaceParams {
             return Tegra::Texture::TextureFormat::R8;
         case PixelFormat::RGBA16F:
             return Tegra::Texture::TextureFormat::R16_G16_B16_A16;
+        case PixelFormat::R11FG11FB10F:
+            return Tegra::Texture::TextureFormat::BF10GF11RF11;
         case PixelFormat::DXT1:
             return Tegra::Texture::TextureFormat::DXT1;
         case PixelFormat::DXT23:
@@ -219,6 +242,8 @@ struct SurfaceParams {
             return Tegra::Texture::TextureFormat::DXT45;
         case PixelFormat::DXN1:
             return Tegra::Texture::TextureFormat::DXN1;
+        case PixelFormat::ASTC_2D_4X4:
+            return Tegra::Texture::TextureFormat::ASTC_2D_4X4;
         default:
             UNREACHABLE();
         }
@@ -243,6 +268,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
             return ComponentType::UNorm;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
+        case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
             return ComponentType::Float;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f886e49ca..65fed77ef 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -16,11 +16,11 @@ namespace Decompiler {
 
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
+using Tegra::Shader::LogicOperation;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 using Tegra::Shader::Sampler;
 using Tegra::Shader::SubOp;
-using Tegra::Shader::Uniform;
 
 constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH;
 
@@ -267,6 +267,27 @@ public:
     }
 
     /**
+     * Wraps a GLSL integer expression in shifts that narrow it to the given register size.
+     * @param value GLSL expression to narrow; it is spliced into the returned string as-is.
+     * @param size Target size: Byte and Short emit a shift pair, Word returns value unchanged.
+     * @returns GLSL string for the narrowed value; any other size logs and hits UNREACHABLE().
+     */
+    static std::string ConvertIntegerSize(const std::string& value, Register::Size size) {
+        switch (size) {
+        case Register::Size::Byte:
+            return "((" + value + " << 24) >> 24)"; // Shift left then right by 24 bits
+        case Register::Size::Short:
+            return "((" + value + " << 16) >> 16)"; // Shift left then right by 16 bits
+        case Register::Size::Word:
+            // Default size - pass the expression through untouched
+            return value;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented conversion size {}", static_cast<u32>(size));
+            UNREACHABLE();
+        }
+    }
+
+    /**
      * Gets a register as an float.
      * @param reg The register to get.
      * @param elem The element to use for the operation.
@@ -282,15 +303,18 @@ public:
      * @param reg The register to get.
      * @param elem The element to use for the operation.
      * @param is_signed Whether to get the register as a signed (or unsigned) integer.
+     * @param size Register size to use for conversion instructions.
      * @returns GLSL string corresponding to the register as an integer.
      */
-    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0,
-                                     bool is_signed = true) {
+    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0, bool is_signed = true,
+                                     Register::Size size = Register::Size::Word) {
         const std::string func = GetGLSLConversionFunc(
             GLSLRegister::Type::Float,
             is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger);
 
-        return func + '(' + GetRegister(reg, elem) + ')';
+        std::string value = func + '(' + GetRegister(reg, elem) + ')';
+        // Narrow the converted value to the requested register size (Word leaves it unchanged).
+        return ConvertIntegerSize(value, size);
     }
 
     /**
@@ -300,13 +324,15 @@ public:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
      * @param dest_elem Optional, the destination element to use for the operation.
      */
     void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value,
-                            u64 dest_num_components, u64 value_num_components, bool is_abs = false,
-                            u64 dest_elem = 0) {
-        SetRegister(reg, elem, value, dest_num_components, value_num_components, is_abs, dest_elem);
+                            u64 dest_num_components, u64 value_num_components,
+                            bool is_saturated = false, u64 dest_elem = 0) {
+        // Saturation is emitted as a GLSL clamp of the value to the range [0.0, 1.0].
+        SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value,
+                    dest_num_components, value_num_components, dest_elem);
     }
 
     /**
@@ -316,18 +342,22 @@ public:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
      * @param dest_elem Optional, the destination element to use for the operation.
+     * @param size Register size to use for conversion instructions.
      */
     void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
                               const std::string& value, u64 dest_num_components,
-                              u64 value_num_components, bool is_abs = false, u64 dest_elem = 0) {
+                              u64 value_num_components, bool is_saturated = false,
+                              u64 dest_elem = 0, Register::Size size = Register::Size::Word) {
+        ASSERT_MSG(!is_saturated, "Unimplemented"); // integer saturation is not emitted here
+
         const std::string func = GetGLSLConversionFunc(
             is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger,
             GLSLRegister::Type::Float);
 
-        SetRegister(reg, elem, func + '(' + value + ')', dest_num_components, value_num_components,
-                    is_abs, dest_elem);
+        SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')',
+                    dest_num_components, value_num_components, dest_elem);
     }
 
     /**
@@ -365,11 +395,9 @@ public:
     }
 
     /// Generates code representing a uniform (C buffer) register, interpreted as the input type.
-    std::string GetUniform(const Uniform& uniform, GLSLRegister::Type type) {
-        declr_const_buffers[uniform.index].MarkAsUsed(static_cast<unsigned>(uniform.index),
-                                                      static_cast<unsigned>(uniform.offset), stage);
-        std::string value =
-            'c' + std::to_string(uniform.index) + '[' + std::to_string(uniform.offset) + ']';
+    std::string GetUniform(u64 index, u64 offset, GLSLRegister::Type type) {
+        declr_const_buffers[index].MarkAsUsed(index, offset, stage);
+        std::string value = 'c' + std::to_string(index) + '[' + std::to_string(offset) + ']';
 
         if (type == GLSLRegister::Type::Float) {
             return value;
@@ -380,10 +408,19 @@ public:
         }
     }
 
-    /// Generates code representing a uniform (C buffer) register, interpreted as the type of the
-    /// destination register.
-    std::string GetUniform(const Uniform& uniform, const Register& dest_reg) {
-        return GetUniform(uniform, regs[dest_reg].GetActiveType());
+    /// Generates code reading a uniform (C buffer) element addressed by a register plus an offset.
+    std::string GetUniformIndirect(u64 index, s64 offset, const Register& index_reg,
+                                   GLSLRegister::Type type) {
+        declr_const_buffers[index].MarkAsUsedIndirect(index, stage);
+        const std::string address =
+            "(floatBitsToInt(" + GetRegister(index_reg, 0) + ") + " + std::to_string(offset) + ')';
+        const std::string element = 'c' + std::to_string(index) + '[' + address + " / 4]";
+        if (type == GLSLRegister::Type::Integer) {
+            return "floatBitsToInt(" + element + ')';
+        } else if (type != GLSLRegister::Type::Float) {
+            UNREACHABLE();
+        }
+        return element;
     }
 
     /// Add declarations for registers
@@ -425,6 +462,14 @@ public:
             ++const_buffer_layout;
         }
         declarations.AddNewLine();
+
+        // Append the sampler2D array for the used textures.
+        size_t num_samplers = GetSamplers().size();
+        if (num_samplers > 0) {
+            declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' +
+                                 std::to_string(num_samplers) + "];");
+            declarations.AddNewLine();
+        }
     }
 
     /// Returns a list of constant buffer declarations
@@ -435,6 +480,32 @@ public:
         return result;
     }
 
+    /// Returns the samplers used so far; the reference stays valid as long as this object does.
+    const std::vector<SamplerEntry>& GetSamplers() const {
+        return used_samplers;
+    }
+
+    /// Maps a shader sampler onto a GLSL sampler name. The first access to a given sampler
+    /// offset registers a new entry; later accesses reuse the name registered for that offset.
+    std::string AccessSampler(const Sampler& sampler) {
+        const size_t sampler_offset = static_cast<size_t>(sampler.index.Value());
+
+        // Look for an entry that was registered earlier for the same offset.
+        const auto matches_offset = [sampler_offset](const SamplerEntry& candidate) {
+            return candidate.GetOffset() == sampler_offset;
+        };
+        const auto hit = std::find_if(used_samplers.begin(), used_samplers.end(), matches_offset);
+        if (hit != used_samplers.end()) {
+            return hit->GetName();
+        }
+
+        // First use: register the sampler at the next free slot and hand back its name.
+        const size_t slot = used_samplers.size();
+        SamplerEntry new_entry{stage, sampler_offset, slot};
+        used_samplers.emplace_back(new_entry);
+        return new_entry.GetName();
+    }
+
 private:
     /// Build GLSL conversion function, e.g. floatBitsToInt, intBitsToFloat, etc.
     const std::string GetGLSLConversionFunc(GLSLRegister::Type src, GLSLRegister::Type dest) const {
@@ -460,13 +531,11 @@ private:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
      * @param dest_elem Optional, the destination element to use for the operation.
      */
     void SetRegister(const Register& reg, u64 elem, const std::string& value,
-                     u64 dest_num_components, u64 value_num_components, bool is_abs,
-                     u64 dest_elem) {
-        std::string dest = GetRegister(reg, dest_elem);
+                     u64 dest_num_components, u64 value_num_components, u64 dest_elem) {
+        std::string dest = GetRegister(reg, static_cast<u32>(dest_elem));
         if (dest_num_components > 1) {
             dest += GetSwizzle(elem);
         }
@@ -476,8 +545,6 @@ private:
             src += GetSwizzle(elem);
         }
 
-        src = is_abs ? "abs(" + src + ')' : src;
-
         shader.AddLine(dest + " = " + src + ';');
     }
 
@@ -498,7 +565,7 @@ private:
             // vertex shader, and what's the value of the fourth element when inside a Tess Eval
             // shader.
             ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
-            return "vec4(0, 0, gl_InstanceID, gl_VertexID)";
+            return "vec4(0, 0, uintBitsToFloat(gl_InstanceID), uintBitsToFloat(gl_VertexID))";
         default:
             const u32 index{static_cast<u32>(attribute) -
                             static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -544,6 +611,7 @@ private:
     std::set<Attribute::Index> declr_input_attribute;
     std::set<Attribute::Index> declr_output_attribute;
     std::array<ConstBufferEntry, Maxwell3D::Regs::MaxConstBuffers> declr_const_buffers;
+    std::vector<SamplerEntry> used_samplers;
     const Maxwell3D::Regs::ShaderStage& stage;
 };
 
@@ -563,7 +631,7 @@ public:
 
     /// Returns entries in the shader that are useful for external functions
     ShaderEntries GetEntries() const {
-        return {regs.GetConstBuffersDeclarations()};
+        return {regs.GetConstBuffersDeclarations(), regs.GetSamplers()};
     }
 
 private:
@@ -585,12 +653,8 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler) const {
-        // TODO(Subv): Support more than just texture sampler 0
-        ASSERT_MSG(sampler.index == Sampler::Index::Sampler_0, "unsupported");
-        const unsigned index{static_cast<unsigned>(sampler.index.Value()) -
-                             static_cast<unsigned>(Sampler::Index::Sampler_0)};
-        return "tex[" + std::to_string(index) + ']';
+    std::string GetSampler(const Sampler& sampler) {
+        return regs.AccessSampler(sampler); // may register a new sampler entry, hence non-const
     }
 
     /**
@@ -696,6 +760,31 @@ private:
         return (absolute_offset % SchedPeriod) == 0;
     }
 
+    /// Writes the result of a bitwise logic operation to a register as a signed integer.
+    /// @param dest Register that receives the result.
+    /// @param logic_op Operation to apply: And, Or, Xor, or PassB (which stores op_b unchanged).
+    /// @param op_a GLSL expression of the first operand.
+    /// @param op_b GLSL expression of the second operand.
+    void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
+                             const std::string& op_b) {
+        const auto store = [&](const std::string& expression) {
+            regs.SetRegisterToInteger(dest, true, 0, expression, 1, 1);
+        };
+        switch (logic_op) {
+        case LogicOperation::And:
+            return store('(' + op_a + " & " + op_b + ')');
+        case LogicOperation::Or:
+            return store('(' + op_a + " | " + op_b + ')');
+        case LogicOperation::Xor:
+            return store('(' + op_a + " ^ " + op_b + ')');
+        case LogicOperation::PassB:
+            return store(op_b);
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
+            UNREACHABLE();
+        }
+    }
+
     /**
      * Compiles a single instruction from Tegra to GLSL.
      * @param offset the offset of the Tegra shader instruction.
@@ -733,21 +822,25 @@ private:
 
         switch (opcode->GetType()) {
         case OpCode::Type::Arithmetic: {
-            std::string op_a = instr.alu.negate_a ? "-" : "";
-            op_a += regs.GetRegisterAsFloat(instr.gpr8);
+            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
             if (instr.alu.abs_a) {
                 op_a = "abs(" + op_a + ')';
             }
 
-            std::string op_b = instr.alu.negate_b ? "-" : "";
+            if (instr.alu.negate_a) {
+                op_a = "-(" + op_a + ')';
+            }
+
+            std::string op_b;
 
             if (instr.is_b_imm) {
-                op_b += GetImmediate19(instr);
+                op_b = GetImmediate19(instr);
             } else {
                 if (instr.is_b_gpr) {
-                    op_b += regs.GetRegisterAsFloat(instr.gpr20);
+                    op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, instr.gpr0);
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Float);
                 }
             }
 
@@ -755,6 +848,10 @@ private:
                 op_b = "abs(" + op_b + ')';
             }
 
+            if (instr.alu.negate_b) {
+                op_b = "-(" + op_b + ')';
+            }
+
             switch (opcode->GetId()) {
             case OpCode::Id::MOV_C:
             case OpCode::Id::MOV_R: {
@@ -762,58 +859,49 @@ private:
                 break;
             }
 
-            case OpCode::Id::MOV32_IMM: {
-                // mov32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
-                break;
-            }
             case OpCode::Id::FMUL_C:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
-                break;
-            }
-            case OpCode::Id::FMUL32_IMM: {
-                // fmul32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(
-                    instr.gpr0, 0,
-                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::FADD_C:
             case OpCode::Id::FADD_R:
             case OpCode::Id::FADD_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::MUFU: {
                 switch (instr.sub_op) {
                 case SubOp::Cos:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Sin:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Ex2:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Lg2:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Rcp:
-                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
+                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1,
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Rsq:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Min:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "min(" + op_a + "," + op_b + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 default:
                     NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
@@ -850,52 +938,49 @@ private:
             }
             break;
         }
-        case OpCode::Type::Logic: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, false);
-
-            if (instr.alu.lop.invert_a)
-                op_a = "~(" + op_a + ')';
-
+        case OpCode::Type::ArithmeticImmediate: {
             switch (opcode->GetId()) {
-            case OpCode::Id::LOP32I: {
-                u32 imm = static_cast<u32>(instr.alu.imm20_32.Value());
+            case OpCode::Id::MOV32_IMM: {
+                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
+                break;
+            }
+            case OpCode::Id::FMUL32_IMM: {
+                regs.SetRegisterToFloat(
+                    instr.gpr0, 0,
+                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                break;
+            }
+            }
+            break;
+        }
+        case OpCode::Type::Bfe: {
+            ASSERT_MSG(!instr.bfe.negate_b, "Unimplemented");
 
-                if (instr.alu.lop.invert_b)
-                    imm = ~imm;
+            std::string op_a = instr.bfe.negate_a ? "-" : "";
+            op_a += regs.GetRegisterAsInteger(instr.gpr8);
 
-                switch (instr.alu.lop.operation) {
-                case Tegra::Shader::LogicOperation::And: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " & " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Or: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " | " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Xor: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " ^ " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                default:
-                    NGLOG_CRITICAL(HW_GPU, "Unimplemented lop32i operation: {}",
-                                   static_cast<u32>(instr.alu.lop.operation.Value()));
-                    UNREACHABLE();
-                }
+            switch (opcode->GetId()) {
+            case OpCode::Id::BFE_IMM: {
+                std::string inner_shift =
+                    '(' + op_a + " << " + std::to_string(instr.bfe.GetLeftShiftValue()) + ')';
+                std::string outer_shift =
+                    '(' + inner_shift + " >> " +
+                    std::to_string(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position) + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, outer_shift, 1, 1);
                 break;
             }
             default: {
-                NGLOG_CRITICAL(HW_GPU, "Unhandled logic instruction: {}", opcode->GetName());
+                NGLOG_CRITICAL(HW_GPU, "Unhandled BFE instruction: {}", opcode->GetName());
                 UNREACHABLE();
             }
             }
+
             break;
         }
 
         case OpCode::Type::Shift: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, false);
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
             std::string op_b;
 
             if (instr.is_b_imm) {
@@ -904,11 +989,25 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsInteger(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
                 }
             }
 
             switch (opcode->GetId()) {
+            case OpCode::Id::SHR_C:
+            case OpCode::Id::SHR_R:
+            case OpCode::Id::SHR_IMM: {
+                if (!instr.shift.is_signed) {
+                    // Logical shift right
+                    op_a = "uint(" + op_a + ')';
+                }
+
+                // Cast to int is superfluous for arithmetic shift, it's only for a logical shift
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(" + op_a + " >> " + op_b + ')',
+                                          1, 1);
+                break;
+            }
             case OpCode::Id::SHL_C:
             case OpCode::Id::SHL_R:
             case OpCode::Id::SHL_IMM:
@@ -922,28 +1021,101 @@ private:
             break;
         }
 
-        case OpCode::Type::ScaledAdd: {
+        case OpCode::Type::ArithmeticIntegerImmediate: {
             std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b = std::to_string(instr.alu.imm20_32.Value());
 
-            if (instr.iscadd.negate_a)
-                op_a = '-' + op_a;
+            switch (opcode->GetId()) {
+            case OpCode::Id::IADD32I:
+                if (instr.iadd32i.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+                                          instr.iadd32i.saturate != 0);
+                break;
+            case OpCode::Id::LOP32I: {
+                if (instr.alu.lop32i.invert_a)
+                    op_a = "~(" + op_a + ')';
 
-            std::string op_b = instr.iscadd.negate_b ? "-" : "";
+                if (instr.alu.lop32i.invert_b)
+                    op_b = "~(" + op_b + ')';
 
+                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticIntegerImmediate instruction: {}",
+                               opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+            break;
+        }
+        case OpCode::Type::ArithmeticInteger: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b;
             if (instr.is_b_imm) {
                 op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
             } else {
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsInteger(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
                 }
             }
 
-            std::string shift = std::to_string(instr.iscadd.shift_amount.Value());
+            switch (opcode->GetId()) {
+            case OpCode::Id::IADD_C:
+            case OpCode::Id::IADD_R:
+            case OpCode::Id::IADD_IMM: {
+                if (instr.alu_integer.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+                                          instr.alu.saturate_d);
+                break;
+            }
+            case OpCode::Id::ISCADD_C:
+            case OpCode::Id::ISCADD_R:
+            case OpCode::Id::ISCADD_IMM: {
+                if (instr.alu_integer.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
+
+                std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0,
+                                          "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
+                break;
+            }
+            case OpCode::Id::LOP_C:
+            case OpCode::Id::LOP_R:
+            case OpCode::Id::LOP_IMM: {
+                ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
+                ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
+
+                if (instr.alu.lop.invert_a)
+                    op_a = "~(" + op_a + ')';
+
+                if (instr.alu.lop.invert_b)
+                    op_b = "~(" + op_b + ')';
+
+                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
+                               opcode->GetName());
+                UNREACHABLE();
+            }
+            }
 
-            regs.SetRegisterToInteger(instr.gpr0, true, 0,
-                                      "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
             break;
         }
         case OpCode::Type::Ffma: {
@@ -953,7 +1125,8 @@ private:
 
             switch (opcode->GetId()) {
             case OpCode::Id::FFMA_CR: {
-                op_b += regs.GetUniform(instr.uniform, instr.gpr0);
+                op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        GLSLRegister::Type::Float);
                 op_c += regs.GetRegisterAsFloat(instr.gpr39);
                 break;
             }
@@ -964,7 +1137,8 @@ private:
             }
             case OpCode::Id::FFMA_RC: {
                 op_b += regs.GetRegisterAsFloat(instr.gpr39);
-                op_c += regs.GetUniform(instr.uniform, instr.gpr0);
+                op_c += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        GLSLRegister::Type::Float);
                 break;
             }
             case OpCode::Id::FFMA_IMM: {
@@ -978,31 +1152,33 @@ private:
             }
             }
 
-            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1);
+            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1,
+                                    instr.alu.saturate_d);
             break;
         }
         case OpCode::Type::Conversion: {
-            ASSERT_MSG(instr.conversion.size == Register::Size::Word, "Unimplemented");
             ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-            ASSERT_MSG(!instr.conversion.saturate_a, "Unimplemented");
 
             switch (opcode->GetId()) {
             case OpCode::Id::I2I_R: {
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
 
-                std::string op_a =
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_signed);
+                std::string op_a = regs.GetRegisterAsInteger(
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
-                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_signed, 0, op_a, 1, 1);
+                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
+                                          1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
                 break;
             }
             case OpCode::Id::I2F_R: {
-                std::string op_a =
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_signed);
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+                ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
+                std::string op_a = regs.GetRegisterAsInteger(
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
@@ -1012,13 +1188,71 @@ private:
                 break;
             }
             case OpCode::Id::F2F_R: {
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
                 std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
+                // abs_a modifies the input operand, so round only afterwards (same order as F2I_R).
+                switch (instr.conversion.f2f.rounding) {
+                case Tegra::Shader::F2fRoundingOp::None:
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Floor:
+                    op_a = "floor(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Ceil:
+                    op_a = "ceil(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Trunc:
+                    op_a = "trunc(" + op_a + ')';
+                    break;
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unimplemented f2f rounding mode {}",
+                                   static_cast<u32>(instr.conversion.f2f.rounding.Value()));
+                    UNREACHABLE();
+                }
+
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
+                break;
+            }
+            case OpCode::Id::F2I_R: {
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
+                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
+
+                if (instr.conversion.abs_a) {
+                    op_a = "abs(" + op_a + ')';
+                }
+
+                switch (instr.conversion.f2i.rounding) {
+                case Tegra::Shader::F2iRoundingOp::None:
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Floor:
+                    op_a = "floor(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Ceil:
+                    op_a = "ceil(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Trunc:
+                    op_a = "trunc(" + op_a + ')';
+                    break;
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unimplemented f2i rounding mode {}",
+                                   static_cast<u32>(instr.conversion.f2i.rounding.Value()));
+                    UNREACHABLE();
+                    break;
+                }
+
+                if (instr.conversion.is_output_signed) {
+                    op_a = "int(" + op_a + ')';
+                } else {
+                    op_a = "uint(" + op_a + ')';
+                }
+
+                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
+                                          1, false, 0, instr.conversion.dest_size);
                 break;
             }
             default: {
@@ -1029,36 +1263,60 @@ private:
             break;
         }
         case OpCode::Type::Memory: {
-            const Attribute::Index attribute = instr.attribute.fmt20.index;
-
             switch (opcode->GetId()) {
             case OpCode::Id::LD_A: {
                 ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
                 regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element,
-                                                attribute);
+                                                instr.attribute.fmt20.index);
+                break;
+            }
+            case OpCode::Id::LD_C: {
+                ASSERT_MSG(instr.ld_c.unknown == 0, "Unimplemented");
+
+                std::string op_a =
+                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, instr.gpr8,
+                                            GLSLRegister::Type::Float);
+                std::string op_b =
+                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4, instr.gpr8,
+                                            GLSLRegister::Type::Float);
+
+                switch (instr.ld_c.type.Value()) {
+                case Tegra::Shader::UniformType::Single:
+                    regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                    break;
+
+                case Tegra::Shader::UniformType::Double:
+                    regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                    regs.SetRegisterToFloat(instr.gpr0.Value() + 1, 0, op_b, 1, 1);
+                    break;
+
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unhandled type: {}",
+                                   static_cast<unsigned>(instr.ld_c.type.Value()));
+                    UNREACHABLE();
+                }
                 break;
             }
             case OpCode::Id::ST_A: {
                 ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetOutputAttributeToRegister(attribute, instr.attribute.fmt20.element,
-                                                  instr.gpr0);
+                regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index,
+                                                  instr.attribute.fmt20.element, instr.gpr0);
                 break;
             }
             case OpCode::Id::TEX: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 4, "untested");
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                 const std::string sampler = GetSampler(instr.sampler);
                 const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
-                // Add an extra scope and declare the texture coords inside to prevent overwriting
-                // them in case they are used as outputs of the texs instruction.
+                // Add an extra scope and declare the texture coords inside to prevent
+                // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
                 ++shader.scope;
                 shader.AddLine(coord);
                 const std::string texture = "texture(" + sampler + ", coords)";
 
                 size_t dest_elem{};
-                for (size_t elem = 0; elem < instr.attribute.fmt20.size; ++elem) {
+                for (size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -1071,7 +1329,6 @@ private:
                 break;
             }
             case OpCode::Id::TEXS: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 4, "untested");
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 const std::string sampler = GetSampler(instr.sampler);
@@ -1083,8 +1340,8 @@ private:
                 shader.AddLine(coord);
                 const std::string texture = "texture(" + sampler + ", coords)";
 
-                // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA goes
-                // into gpr28+0 and gpr28+1
+                // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA
+                // goes into gpr28+0 and gpr28+1
                 size_t offset{};
 
                 for (const auto& dest : {instr.gpr0.Value(), instr.gpr28.Value()}) {
@@ -1134,7 +1391,8 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Float);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Float);
                 }
             }
 
@@ -1167,15 +1425,17 @@ private:
         }
         case OpCode::Type::IntegerSetPredicate: {
             std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed);
+            std::string op_b;
 
-            std::string op_b{};
-
-            ASSERT_MSG(!instr.is_b_imm, "ISETP_IMM not implemented");
-
-            if (instr.is_b_gpr) {
-                op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.isetp.is_signed);
+            if (instr.is_b_imm) {
+                op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
             } else {
-                op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                if (instr.is_b_gpr) {
+                    op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.isetp.is_signed);
+                } else {
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
+                }
             }
 
             using Tegra::Shader::Pred;
@@ -1221,7 +1481,8 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Float);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Float);
                 }
             }
 
@@ -1229,8 +1490,8 @@ private:
                 op_b = "abs(" + op_b + ')';
             }
 
-            // The fset instruction sets a register to 1.0 if the condition is true, and to 0
-            // otherwise.
+            // The fset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
+            // condition is true, and to 0 otherwise.
             std::string second_pred =
                 GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0);
 
@@ -1248,6 +1509,41 @@ private:
             }
             break;
         }
+        case OpCode::Type::IntegerSet: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed);
+
+            std::string op_b;
+
+            if (instr.is_b_imm) {
+                op_b = std::to_string(instr.alu.GetSignedImm20_20());
+            } else {
+                if (instr.is_b_gpr) {
+                    op_b = regs.GetRegisterAsInteger(instr.gpr20, 0, instr.iset.is_signed);
+                } else {
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Integer);
+                }
+            }
+
+            // The iset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
+            // condition is true, and to 0 otherwise.
+            std::string second_pred =
+                GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0);
+
+            std::string comparator = GetPredicateComparison(instr.iset.cond);
+            std::string combiner = GetPredicateCombiner(instr.iset.op);
+
+            std::string predicate = "(((" + op_a + ") " + comparator + " (" + op_b + ")) " +
+                                    combiner + " (" + second_pred + "))";
+
+            if (instr.iset.bf) {
+                regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
+            } else {
+                regs.SetRegisterToInteger(instr.gpr0, false, 0, predicate + " ? 0xFFFFFFFF : 0", 1,
+                                          1);
+            }
+            break;
+        }
         default: {
             switch (opcode->GetId()) {
             case OpCode::Id::EXIT: {
@@ -1261,8 +1557,8 @@ private:
 
                 shader.AddLine("return true;");
                 if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
-                    // If this is an unconditional exit then just end processing here, otherwise we
-                    // have to account for the possibility of the condition not being met, so
+                    // If this is an unconditional exit then just end processing here, otherwise
+                    // we have to account for the possibility of the condition not being met, so
                     // continue processing the next instruction.
                     offset = PROGRAM_END - 1;
                 }
@@ -1284,6 +1580,11 @@ private:
                 regs.SetRegisterToInputAttibute(instr.gpr0, attribute.element, attribute.index);
                 break;
             }
+            case OpCode::Id::SSY: {
+                // The SSY opcode tells the GPU where to re-converge divergent execution paths, we
+                // can ignore this when generating GLSL code.
+                break;
+            }
             default: {
                 NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
                 UNREACHABLE();
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 254f6e2c3..c1e6fac9f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -39,6 +39,10 @@ void main() {
     // Viewport can be flipped, which is unsupported by glViewport
     position.xy *= viewport_flip.xy;
     gl_Position = position;
+
+    // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
+    // For now, this is here to bring order in lieu of proper emulation
+    position.w = 1.0;
 }
 )";
     out += program.first;
@@ -62,8 +66,6 @@ layout (std140) uniform fs_config {
     vec4 viewport_flip;
 };
 
-uniform sampler2D tex[32];
-
 void main() {
     exec_shader();
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 458032b5c..ed890e0f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -22,17 +22,28 @@ class ConstBufferEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    void MarkAsUsed(unsigned index, unsigned offset, Maxwell::ShaderStage stage) {
+    void MarkAsUsed(u64 index, u64 offset, Maxwell::ShaderStage stage) {
         is_used = true;
-        this->index = index;
+        this->index = static_cast<unsigned>(index);
+        this->stage = stage;
+        max_offset = std::max(max_offset, static_cast<unsigned>(offset));
+    }
+
+    void MarkAsUsedIndirect(u64 index, Maxwell::ShaderStage stage) {
+        is_used = true;
+        is_indirect = true;
+        this->index = static_cast<unsigned>(index);
         this->stage = stage;
-        max_offset = std::max(max_offset, offset);
     }
 
     bool IsUsed() const {
         return is_used;
     }
 
+    bool IsIndirect() const {
+        return is_indirect;
+    }
+
     unsigned GetIndex() const {
         return index;
     }
@@ -51,13 +62,54 @@ private:
     };
 
     bool is_used{};
+    bool is_indirect{};
     unsigned index{};
     unsigned max_offset{};
     Maxwell::ShaderStage stage;
 };
 
+class SamplerEntry {
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+    // Describes one texture sampler used by a shader: stage, TSC offset and GLSL array index.
+public:
+    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index)
+        : offset(offset), stage(stage), sampler_index(index) {}
+
+    size_t GetOffset() const {
+        return offset;
+    }
+
+    size_t GetIndex() const {
+        return sampler_index;
+    }
+
+    Maxwell::ShaderStage GetStage() const {
+        return stage;
+    }
+    /// Returns the GLSL expression naming this sampler: "<stage array name>[<sampler index>]".
+    std::string GetName() const {
+        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' +
+               std::to_string(sampler_index) + ']';
+    }
+    /// Returns the name of the GLSL sampler array used for the given stage (e.g. "tex_fs").
+    static std::string GetArrayName(Maxwell::ShaderStage stage) {
+        return TextureSamplerNames[static_cast<size_t>(stage)];
+    }
+
+private:
+    static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = {
+        "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs", // Indexed by shader stage value.
+    };
+    /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
+    /// instruction.
+    size_t offset;
+    Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used.
+    size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
+};
+
 struct ShaderEntries {
     std::vector<ConstBufferEntry> const_buffer_entries;
+    std::vector<SamplerEntry> texture_samplers;
 };
 
 using ProgramResult = std::pair<std::string, ShaderEntries>;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index ccdfc2718..d7167b298 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -32,33 +32,14 @@ void SetShaderUniformBlockBindings(GLuint shader) {
                                  sizeof(MaxwellUniformData));
 }
 
-void SetShaderSamplerBindings(GLuint shader) {
-    OpenGLState cur_state = OpenGLState::GetCurState();
-    GLuint old_program = std::exchange(cur_state.draw.shader_program, shader);
-    cur_state.Apply();
-
-    // Set the texture samplers to correspond to different texture units
-    for (u32 texture = 0; texture < NumTextureSamplers; ++texture) {
-        // Set the texture samplers to correspond to different texture units
-        std::string uniform_name = "tex[" + std::to_string(texture) + "]";
-        GLint uniform_tex = glGetUniformLocation(shader, uniform_name.c_str());
-        if (uniform_tex != -1) {
-            glUniform1i(uniform_tex, TextureUnits::MaxwellTexture(texture).id);
-        }
-    }
-
-    cur_state.draw.shader_program = old_program;
-    cur_state.Apply();
-}
-
 } // namespace Impl
 
 void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) {
     const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
 
     // TODO(bunnei): Support more than one viewport
-    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0 : 1.0;
-    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0 : 1.0;
+    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
+    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
 }
 
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index e963b4b7e..4295c20a6 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -45,7 +45,6 @@ public:
         shader.Create(program_result.first.c_str(), type);
         program.Create(true, shader.handle);
         Impl::SetShaderUniformBlockBindings(program.handle);
-        Impl::SetShaderSamplerBindings(program.handle);
         entries = program_result.second;
     }
     GLuint GetHandle() const {
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index f91dfe36a..1f1e48425 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -50,6 +50,10 @@ OpenGLState::OpenGLState() {
     for (auto& texture_unit : texture_units) {
         texture_unit.texture_2d = 0;
         texture_unit.sampler = 0;
+        texture_unit.swizzle.r = GL_RED;
+        texture_unit.swizzle.g = GL_GREEN;
+        texture_unit.swizzle.b = GL_BLUE;
+        texture_unit.swizzle.a = GL_ALPHA;
     }
 
     lighting_lut.texture_buffer = 0;
@@ -192,13 +196,22 @@ void OpenGLState::Apply() const {
     }
 
     // Textures
-    for (size_t i = 0; i < std::size(texture_units); ++i) {
+    for (int i = 0; i < std::size(texture_units); ++i) {
         if (texture_units[i].texture_2d != cur_state.texture_units[i].texture_2d) {
             glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
             glBindTexture(GL_TEXTURE_2D, texture_units[i].texture_2d);
         }
         if (texture_units[i].sampler != cur_state.texture_units[i].sampler) {
-            glBindSampler(i, texture_units[i].sampler);
+            glBindSampler(static_cast<GLuint>(i), texture_units[i].sampler);
+        }
+        // Update the texture swizzle
+        if (texture_units[i].swizzle.r != cur_state.texture_units[i].swizzle.r ||
+            texture_units[i].swizzle.g != cur_state.texture_units[i].swizzle.g ||
+            texture_units[i].swizzle.b != cur_state.texture_units[i].swizzle.b ||
+            texture_units[i].swizzle.a != cur_state.texture_units[i].swizzle.a) {
+            std::array<GLint, 4> mask = {texture_units[i].swizzle.r, texture_units[i].swizzle.g,
+                                         texture_units[i].swizzle.b, texture_units[i].swizzle.a};
+            glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, mask.data());
         }
     }
 
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 75c08e645..839e50e93 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -85,6 +85,12 @@ public:
     struct {
         GLuint texture_2d; // GL_TEXTURE_BINDING_2D
         GLuint sampler;    // GL_SAMPLER_BINDING
+        struct {
+            GLint r; // GL_TEXTURE_SWIZZLE_R
+            GLint g; // GL_TEXTURE_SWIZZLE_G
+            GLint b; // GL_TEXTURE_SWIZZLE_B
+            GLint a; // GL_TEXTURE_SWIZZLE_A
+        } swizzle;
     } texture_units[32];
 
     struct {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index a630610d8..2155fb019 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -100,6 +100,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
     switch (wrap_mode) {
     case Tegra::Texture::WrapMode::Wrap:
         return GL_REPEAT;
+    case Tegra::Texture::WrapMode::Mirror:
+        return GL_MIRRORED_REPEAT;
     case Tegra::Texture::WrapMode::ClampToEdge:
         return GL_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::ClampOGL:
@@ -178,4 +180,25 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     return {};
 }
 
+inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
+    switch (source) { // Translate the Maxwell texture swizzle source into its OpenGL enum.
+    case Tegra::Texture::SwizzleSource::Zero:
+        return GL_ZERO;
+    case Tegra::Texture::SwizzleSource::R:
+        return GL_RED;
+    case Tegra::Texture::SwizzleSource::G:
+        return GL_GREEN;
+    case Tegra::Texture::SwizzleSource::B:
+        return GL_BLUE;
+    case Tegra::Texture::SwizzleSource::A:
+        return GL_ALPHA;
+    case Tegra::Texture::SwizzleSource::OneInt: // Both "one" sources map to GL_ONE.
+    case Tegra::Texture::SwizzleSource::OneFloat:
+        return GL_ONE;
+    }
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNREACHABLE(); // Only reached when `source` matched none of the cases above.
+    return {};
+}
+
 } // namespace MaxwellToGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3440d2190..f33766bfd 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,6 +316,7 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
     }};
 
     state.texture_units[0].texture_2d = screen_info.display_texture;
+    state.texture_units[0].swizzle = {GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA};
     state.Apply();
 
     glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices.data());
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..3c4ad1c9d
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1646 @@
+// Copyright 2016 The University of North Carolina at Chapel Hill
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+// <http://gamma.cs.unc.edu/FasTC/>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "video_core/textures/astc.h"
+
+class BitStream { // LSB-first bit reader/writer over a byte buffer supplied by the caller.
+public:
+    BitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0)
+        : m_BitsWritten(0), m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8),
+          m_BitsRead(0), done(false) {}
+    // nBits: written-bit count at which writing stops; start_offset % 8: first bit used in *ptr.
+    int GetBitsWritten() const {
+        return m_BitsWritten;
+    }
+
+    ~BitStream() {}
+    // Writes the low nBits of val, starting with the most significant of those bits.
+    void WriteBitsR(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> (nBits - i - 1)) & 1);
+        }
+    }
+    // Writes the low nBits of val, starting with the least significant bit.
+    void WriteBits(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> i) & 1);
+        }
+    }
+
+    int GetBitsRead() const {
+        return m_BitsRead;
+    }
+    // Returns the bit under the cursor and advances by one bit; reads are not range-checked.
+    int ReadBit() {
+        // Shift the wanted bit down to position 0, then step to the next byte after bit 7.
+        int bit = *m_CurByte >> m_NextBit++;
+        while (m_NextBit >= 8) {
+            m_NextBit -= 8;
+            m_CurByte++;
+        }
+
+        m_BitsRead++;
+        return bit & 1;
+    }
+    // Reads nBits bits; the first bit read becomes bit 0 of the result.
+    unsigned int ReadBits(unsigned int nBits) {
+        unsigned int ret = 0;
+        for (unsigned int i = 0; i < nBits; i++) {
+            ret |= static_cast<unsigned int>(ReadBit()) << i;
+        }
+        return ret;
+    }
+
+private:
+    void WriteBit(int b) {
+        // Once the stream is marked done, further writes are silently dropped.
+        if (done)
+            return;
+
+        const unsigned int mask = 1u << m_NextBit++;
+
+        // clear the bit
+        *m_CurByte &= ~mask;
+
+        // Write the bit, if necessary
+        if (b)
+            *m_CurByte |= mask;
+
+        // Next byte?
+        if (m_NextBit >= 8) {
+            m_CurByte += 1;
+            m_NextBit = 0;
+        }
+        // Mark the stream done as soon as the written-bit count reaches m_NumBits.
+        done = done || ++m_BitsWritten >= m_NumBits;
+    }
+
+    int m_BitsWritten;
+    const int m_NumBits;
+    unsigned char* m_CurByte;
+    int m_NextBit;
+    int m_BitsRead;
+
+    bool done;
+};
+
+template <typename IntType>
+class Bits { // Read-only view used to pick single bits or bit ranges out of an integer.
+private:
+    const IntType& m_Bits;
+
+    // Not default-constructible or copyable: the view must always refer to a real value.
+    Bits() = delete;
+    Bits(const Bits&) = delete;
+    Bits& operator=(const Bits&) = delete;
+
+public:
+    explicit Bits(IntType& v) : m_Bits(v) {}
+    // Returns the bit at position bitPos (0 or 1).
+    uint8_t operator[](uint32_t bitPos) {
+        return static_cast<uint8_t>((m_Bits >> bitPos) & 1);
+    }
+
+    // Returns the bits in the inclusive range [start, end] (either order), shifted down to bit 0.
+    IntType operator()(uint32_t start, uint32_t end) {
+        if (start > end) {
+            const uint32_t t = start;
+            start = end;
+            end = t;
+        }
+        // Build the mask in 64 bits: `1 << width` on int is undefined for widths of 31 or more.
+        const uint32_t width = end - start + 1;
+        const uint64_t mask = width >= 64 ? ~uint64_t{0} : (uint64_t{1} << width) - 1;
+        return static_cast<IntType>((m_Bits >> start) & mask);
+    }
+};
+// How one value of an integer sequence is stored: plain bits, or bits plus a quint or a trit.
+enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit };
+
+class IntegerEncodedValue {
+private:
+    EIntegerEncoding m_Encoding; // Encoding kind: plain bits, or bits plus a trit / quint.
+    uint32_t m_NumBits;          // Number of plain bits stored per value.
+    uint32_t m_BitValue;
+    union {
+        uint32_t m_QuintValue;
+        uint32_t m_TritValue;
+    };
+
+public:
+    // The encoding fields are deliberately non-const: that keeps the compiler-generated copy
+    // operations valid, which is what lets these values live in std::vectors (assigning through
+    // placement new over const members would be undefined behaviour).
+    IntegerEncodedValue(const IntegerEncodedValue&) = default;
+    IntegerEncodedValue& operator=(const IntegerEncodedValue&) = default;
+
+    // Creates a value with the given encoding; the bit and trit/quint payloads start at zero.
+    IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits)
+        : m_Encoding(encoding), m_NumBits(numBits), m_BitValue(0), m_QuintValue(0) {}
+
+    EIntegerEncoding GetEncoding() const {
+        return m_Encoding;
+    }
+    uint32_t BaseBitLength() const {
+        return m_NumBits;
+    }
+
+    uint32_t GetBitValue() const {
+        return m_BitValue;
+    }
+    void SetBitValue(uint32_t val) {
+        m_BitValue = val;
+    }
+
+    uint32_t GetTritValue() const {
+        return m_TritValue;
+    }
+    void SetTritValue(uint32_t val) {
+        m_TritValue = val;
+    }
+
+    uint32_t GetQuintValue() const {
+        return m_QuintValue;
+    }
+    void SetQuintValue(uint32_t val) {
+        m_QuintValue = val;
+    }
+
+    bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits;
+    }
+
+    // Returns the number of bits required to encode nVals values.
+    uint32_t GetBitLength(uint32_t nVals) const {
+        uint32_t totalBits = m_NumBits * nVals;
+        if (m_Encoding == eIntegerEncoding_Trit) {
+            totalBits += (nVals * 8 + 4) / 5; // 8 trit bits per block of 5 values, rounded up
+        } else if (m_Encoding == eIntegerEncoding_Quint) {
+            totalBits += (nVals * 7 + 2) / 3; // 7 quint bits per block of 3 values, rounded up
+        }
+        return totalBits;
+    }
+
+    // Count the number of bits set in a number.
+    static inline uint32_t Popcnt(uint32_t n) {
+        uint32_t c;
+        for (c = 0; n; c++) {
+            n &= n - 1;
+        }
+        return c;
+    }
+
+    // Returns the encoding for the largest range of the form 2^n - 1, 3*2^n - 1 or 5*2^n - 1
+    // that does not exceed maxVal (plain bits, trits or quints respectively).
+    static IntegerEncodedValue CreateEncoding(uint32_t maxVal) {
+        while (maxVal > 0) {
+            uint32_t check = maxVal + 1;
+
+            // Is maxVal of the type 2^n - 1?
+            if (!(check & (check - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal));
+            }
+
+            // Is maxVal of the type 3*2^n - 1?
+            if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1));
+            }
+
+            // Is maxVal of the type 5*2^n - 1?
+            if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+                return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1));
+            }
+
+            // Apparently it can't be represented with a bounded integer sequence...
+            // just iterate.
+            maxVal--;
+        }
+        return IntegerEncodedValue(eIntegerEncoding_JustBits, 0);
+    }
+
+    // Fills result with the values that are encoded in the given
+    // bitstream. We must know beforehand what the maximum possible
+    // value is, and how many values we're decoding.
+    static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, BitStream& bits,
+                                      uint32_t maxRange, uint32_t nValues) {
+        // Determine encoding parameters
+        IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange);
+
+        // Start decoding
+        uint32_t nValsDecoded = 0;
+        while (nValsDecoded < nValues) {
+            switch (val.GetEncoding()) {
+            case eIntegerEncoding_Quint:
+                DecodeQuintBlock(bits, result, val.BaseBitLength());
+                nValsDecoded += 3;
+                break;
+
+            case eIntegerEncoding_Trit:
+                DecodeTritBlock(bits, result, val.BaseBitLength());
+                nValsDecoded += 5;
+                break;
+
+            case eIntegerEncoding_JustBits:
+                val.SetBitValue(bits.ReadBits(val.BaseBitLength()));
+                result.push_back(val);
+                nValsDecoded++;
+                break;
+            }
+        }
+    }
+
+private:
+    static void DecodeTritBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+                                uint32_t nBitsPerValue) {
+        // Implement the algorithm in section C.2.12
+        uint32_t m[5];
+        uint32_t t[5];
+        uint32_t T;
+
+        // Read the trit encoded block according to
+        // table C.2.14
+        m[0] = bits.ReadBits(nBitsPerValue);
+        T = bits.ReadBits(2);
+        m[1] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBits(2) << 2;
+        m[2] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBit() << 4;
+        m[3] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBits(2) << 5;
+        m[4] = bits.ReadBits(nBitsPerValue);
+        T |= bits.ReadBit() << 7;
+
+        uint32_t C = 0;
+
+        Bits<uint32_t> Tb(T);
+        if (Tb(2, 4) == 7) {
+            C = (Tb(5, 7) << 2) | Tb(0, 1);
+            t[4] = t[3] = 2;
+        } else {
+            C = Tb(0, 4);
+            if (Tb(5, 6) == 3) {
+                t[4] = 2;
+                t[3] = Tb[7];
+            } else {
+                t[4] = Tb[7];
+                t[3] = Tb(5, 6);
+            }
+        }
+
+        Bits<uint32_t> Cb(C);
+        if (Cb(0, 1) == 3) {
+            t[2] = 2;
+            t[1] = Cb[4];
+            t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
+        } else if (Cb(2, 3) == 3) {
+            t[2] = 2;
+            t[1] = 2;
+            t[0] = Cb(0, 1);
+        } else {
+            t[2] = Cb[4];
+            t[1] = Cb(2, 3);
+            t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
+        }
+
+        for (uint32_t i = 0; i < 5; i++) {
+            IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue);
+            val.SetBitValue(m[i]);
+            val.SetTritValue(t[i]);
+            result.push_back(val);
+        }
+    }
+
+    static void DecodeQuintBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+                                 uint32_t nBitsPerValue) {
+        // Implement the algorithm in section C.2.12
+        uint32_t m[3];
+        uint32_t q[3];
+        uint32_t Q;
+
+        // Read the quint encoded block according to
+        // table C.2.15
+        m[0] = bits.ReadBits(nBitsPerValue);
+        Q = bits.ReadBits(3);
+        m[1] = bits.ReadBits(nBitsPerValue);
+        Q |= bits.ReadBits(2) << 3;
+        m[2] = bits.ReadBits(nBitsPerValue);
+        Q |= bits.ReadBits(2) << 5;
+
+        Bits<uint32_t> Qb(Q);
+        if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
+            q[0] = q[1] = 4;
+            q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
+        } else {
+            uint32_t C = 0;
+            if (Qb(1, 2) == 3) {
+                q[2] = 4;
+                C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
+            } else {
+                q[2] = Qb(5, 6);
+                C = Qb(0, 4);
+            }
+
+            Bits<uint32_t> Cb(C);
+            if (Cb(0, 2) == 5) {
+                q[1] = 4;
+                q[0] = Cb(3, 4);
+            } else {
+                q[1] = Cb(3, 4);
+                q[0] = Cb(0, 2);
+            }
+        }
+
+        for (uint32_t i = 0; i < 3; i++) {
+            IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue);
+            val.SetBitValue(m[i]);
+            val.SetQuintValue(q[i]);
+            result.push_back(val);
+        }
+    }
+};
+
+namespace ASTCC {
+
+// Parameters of a block's texel weight grid, as produced by decoding the block mode.
+// Every field starts out zero/false through its default member initializer, so a
+// default-constructed object is fully initialised without touching raw memory.
+struct TexelWeightParams {
+    uint32_t m_Width = 0;          // Weight grid width.
+    uint32_t m_Height = 0;         // Weight grid height.
+    bool m_bDualPlane = false;     // True when two weight planes are stored.
+    uint32_t m_MaxWeight = 0;      // Largest value a stored weight can take.
+    bool m_bError = false;         // Set when the block mode is reserved/invalid.
+    bool m_bVoidExtentLDR = false; // Set for LDR void-extent blocks.
+    bool m_bVoidExtentHDR = false; // Set for HDR void-extent blocks.
+
+    // Number of bits the packed weight data occupies: the bit length of
+    // GetNumWeightValues() integers encoded with a maximum value of m_MaxWeight.
+    uint32_t GetPackedBitSize() const {
+        return IntegerEncodedValue::CreateEncoding(m_MaxWeight)
+            .GetBitLength(GetNumWeightValues());
+    }
+
+    // Number of stored weights: one per grid cell, doubled in dual-plane mode.
+    uint32_t GetNumWeightValues() const {
+        const uint32_t perPlane = m_Width * m_Height;
+        return m_bDualPlane ? perPlane * 2 : perPlane;
+    }
+};
+
+// Parses the 11-bit block mode at the current stream position and returns the
+// weight-grid parameters it describes (grid width/height, dual-plane flag and the
+// maximum weight value).
+//
+// Void-extent blocks are reported through m_bVoidExtentLDR / m_bVoidExtentHDR and
+// reserved encodings through m_bError; on those early-return paths the remaining
+// fields keep their default-constructed values. The void-extent path may consume one
+// additional bit from the stream.
+TexelWeightParams DecodeBlockInfo(BitStream& strm) {
+    TexelWeightParams params;
+
+    // Read the entire block mode all at once
+    uint16_t modeBits = strm.ReadBits(11);
+
+    // Does this match the void extent block mode?
+    // (the low nine bits must equal 0b111111100)
+    if ((modeBits & 0x01FF) == 0x1FC) {
+        // Bit 9 selects between the HDR and LDR flavours.
+        if (modeBits & 0x200) {
+            params.m_bVoidExtentHDR = true;
+        } else {
+            params.m_bVoidExtentLDR = true;
+        }
+
+        // Next two bits must be one.
+        // (bit 10 of the mode, then one more bit from the stream, which is only
+        // read when bit 10 is set because of short-circuit evaluation)
+        if (!(modeBits & 0x400) || !strm.ReadBit()) {
+            params.m_bError = true;
+        }
+
+        return params;
+    }
+
+    // First check if the last four bits are zero
+    // (this encoding is treated as an error)
+    if ((modeBits & 0xF) == 0) {
+        params.m_bError = true;
+        return params;
+    }
+
+    // If the last two bits are zero, then if bits
+    // [6-8] are all ones, this is also reserved.
+    if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
+        params.m_bError = true;
+        return params;
+    }
+
+    // Otherwise, there is no error... Figure out the layout
+    // of the block mode. Layout is determined by a number
+    // between 0 and 9 corresponding to table C.2.8 of the
+    // ASTC spec.
+    uint32_t layout = 0;
+
+    // Layouts 0-4 have at least one of the two lowest bits set; layouts 5-9 have
+    // both clear.
+    if ((modeBits & 0x1) || (modeBits & 0x2)) {
+        // layout is in [0-4]
+        if (modeBits & 0x8) {
+            // layout is in [2-4]
+            if (modeBits & 0x4) {
+                // layout is in [3-4]
+                if (modeBits & 0x100) {
+                    layout = 4;
+                } else {
+                    layout = 3;
+                }
+            } else {
+                layout = 2;
+            }
+        } else {
+            // layout is in [0-1]
+            if (modeBits & 0x4) {
+                layout = 1;
+            } else {
+                layout = 0;
+            }
+        }
+    } else {
+        // layout is in [5-9]
+        if (modeBits & 0x100) {
+            // layout is in [7-9]
+            if (modeBits & 0x80) {
+                // layout is in [7-8]
+                // Bits 6-8 all set with the low two bits clear was rejected as
+                // reserved above, so bit 6 cannot be set here.
+                assert((modeBits & 0x40) == 0U);
+                if (modeBits & 0x20) {
+                    layout = 8;
+                } else {
+                    layout = 7;
+                }
+            } else {
+                layout = 9;
+            }
+        } else {
+            // layout is in [5-6]
+            if (modeBits & 0x80) {
+                layout = 6;
+            } else {
+                layout = 5;
+            }
+        }
+    }
+
+    assert(layout < 10);
+
+    // Determine R
+    // R is a 3-bit value: bit 0 comes from mode bit 4, and bits 1-2 come from mode
+    // bits 0-1 (layouts 0-4) or mode bits 2-3 (layouts 5-9).
+    uint32_t R = !!(modeBits & 0x10);
+    if (layout < 5) {
+        R |= (modeBits & 0x3) << 1;
+    } else {
+        R |= (modeBits & 0xC) >> 1;
+    }
+    assert(2 <= R && R <= 7);
+
+    // Determine width & height
+    // A is taken from mode bits 5-6; B from bits 7-8 (a single bit 7 for layouts 3
+    // and 4, and bits 9-10 for layout 9).
+    switch (layout) {
+    case 0: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = B + 4;
+        params.m_Height = A + 2;
+        break;
+    }
+
+    case 1: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = B + 8;
+        params.m_Height = A + 2;
+        break;
+    }
+
+    case 2: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x3;
+        params.m_Width = A + 2;
+        params.m_Height = B + 8;
+        break;
+    }
+
+    case 3: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x1;
+        params.m_Width = A + 2;
+        params.m_Height = B + 6;
+        break;
+    }
+
+    case 4: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 7) & 0x1;
+        params.m_Width = B + 2;
+        params.m_Height = A + 2;
+        break;
+    }
+
+    case 5: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        params.m_Width = 12;
+        params.m_Height = A + 2;
+        break;
+    }
+
+    case 6: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        params.m_Width = A + 2;
+        params.m_Height = 12;
+        break;
+    }
+
+    case 7: {
+        params.m_Width = 6;
+        params.m_Height = 10;
+        break;
+    }
+
+    case 8: {
+        params.m_Width = 10;
+        params.m_Height = 6;
+        break;
+    }
+
+    case 9: {
+        uint32_t A = (modeBits >> 5) & 0x3;
+        uint32_t B = (modeBits >> 9) & 0x3;
+        params.m_Width = A + 6;
+        params.m_Height = B + 6;
+        break;
+    }
+
+    default:
+        assert(!"Don't know this layout...");
+        params.m_bError = true;
+        break;
+    }
+
+    // Determine whether or not we're using dual planes
+    // and/or high precision layouts.
+    // Layout 9 uses bits 9-10 for B, so both flags are forced off there.
+    bool D = (layout != 9) && (modeBits & 0x400);
+    bool H = (layout != 9) && (modeBits & 0x200);
+
+    // R (2..7) indexes the table of maximum weight values; H picks the
+    // high-precision table.
+    if (H) {
+        const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31};
+        params.m_MaxWeight = maxWeights[R - 2];
+    } else {
+        const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7};
+        params.m_MaxWeight = maxWeights[R - 2];
+    }
+
+    params.m_bDualPlane = D;
+
+    return params;
+}
+
+// Fills a blockWidth x blockHeight region of outBuf with the single colour stored in
+// an LDR void-extent block. strm must be positioned at the start of the four extent
+// coordinates, which are followed by four 16-bit colour channels.
+void FillVoidExtentLDR(BitStream& strm, uint32_t* const outBuf, uint32_t blockWidth,
+                       uint32_t blockHeight) {
+    // The four 13-bit extent coordinates are not used; read them only to skip them.
+    for (int coord = 0; coord < 4; ++coord) {
+        strm.ReadBits(13);
+    }
+
+    // Channels arrive in R, G, B, A order, 16 bits each.
+    const uint16_t red = strm.ReadBits(16);
+    const uint16_t green = strm.ReadBits(16);
+    const uint16_t blue = strm.ReadBits(16);
+    const uint16_t alpha = strm.ReadBits(16);
+
+    // Keep only the top byte of every channel and pack them with red in the lowest
+    // byte and alpha in the highest.
+    uint32_t packed = red >> 8;
+    packed |= green & 0xFF00;
+    packed |= (static_cast<uint32_t>(blue) & 0xFF00) << 8;
+    packed |= (static_cast<uint32_t>(alpha) & 0xFF00) << 16;
+
+    for (uint32_t row = 0; row < blockHeight; ++row) {
+        for (uint32_t col = 0; col < blockWidth; ++col) {
+            outBuf[row * blockWidth + col] = packed;
+        }
+    }
+}
+
+// Writes the error marker colour 0xFFFF00FF to every texel of a
+// blockWidth x blockHeight block stored row by row in outBuf.
+void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) {
+    const uint32_t kErrorColor = 0xFFFF00FF;
+    for (uint32_t row = 0; row < blockHeight; ++row) {
+        for (uint32_t col = 0; col < blockWidth; ++col) {
+            outBuf[row * blockWidth + col] = kErrorColor;
+        }
+    }
+}
+
+// Widens the low numBits bits of val to toBit bits by repeating the bit pattern from
+// the most significant end downwards; the last repetition is cut short (keeping its
+// high bits) when toBit is not a multiple of numBits. For example the 3-bit pattern
+// 101 widened to 8 bits becomes 10110110.
+// Returns 0 when numBits or toBit is 0. When numBits >= toBit the masked pattern is
+// returned unchanged.
+template <typename IntType>
+IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) {
+    if (numBits == 0 || toBit == 0) {
+        return 0;
+    }
+
+    const IntType pattern = val & ((1 << numBits) - 1);
+    IntType widened = pattern;
+    for (uint32_t filled = numBits; filled < toBit;) {
+        // Append a whole copy of the pattern, or only as many of its top bits as
+        // still fit.
+        const uint32_t room = toBit - filled;
+        const uint32_t chunk = (numBits > room) ? room : numBits;
+        widened <<= chunk;
+        widened |= pattern >> (numBits - chunk);
+        filled += chunk;
+    }
+    return widened;
+}
+
+// A four-channel pixel. Channels are stored in A, R, G, B order as signed 16-bit
+// values, and each channel carries its own bit depth.
+class Pixel {
+protected:
+    using ChannelType = int16_t;
+    uint8_t m_BitDepth[4];
+    int16_t color[4];
+
+public:
+    // Creates a pixel with all channels zero at 8 bits per channel.
+    Pixel() {
+        for (int ch = 0; ch < 4; ++ch) {
+            color[ch] = 0;
+            m_BitDepth[ch] = 8;
+        }
+    }
+
+    // Creates a pixel from explicit channel values; every channel gets bitDepth.
+    Pixel(ChannelType a, ChannelType r, ChannelType g, ChannelType b, unsigned bitDepth = 8) {
+        color[0] = a;
+        color[1] = r;
+        color[2] = g;
+        color[3] = b;
+
+        for (int ch = 0; ch < 4; ++ch) {
+            m_BitDepth[ch] = bitDepth;
+        }
+    }
+
+    // Rescales every channel from its current bit depth to the matching entry of
+    // depth and records the new depth. See the single-channel overload for how
+    // values are narrowed or widened.
+    void ChangeBitDepth(const uint8_t (&depth)[4]) {
+        for (uint32_t ch = 0; ch < 4; ++ch) {
+            color[ch] = ChangeBitDepth(color[ch], m_BitDepth[ch], depth[ch]);
+            m_BitDepth[ch] = depth[ch];
+        }
+    }
+
+    // Maps a channel value with the given bit depth to a float by dividing it by
+    // the largest value representable at that depth.
+    template <typename IntType>
+    static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) {
+        const float maxValue = static_cast<float>((1 << bitDepth) - 1);
+        return static_cast<float>(channel) / maxValue;
+    }
+
+    // Rescales one channel value from oldDepth to newDepth bits (both at most 8).
+    //  - equal depths: the value is returned unchanged;
+    //  - from depth 0: the maximum value at newDepth is returned;
+    //  - widening: the bit pattern is replicated into the extra bits;
+    //  - to depth 0: 0xFF is returned;
+    //  - narrowing: the dropped bits are rounded away and the result is clamped to
+    //    the maximum value at newDepth.
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) {
+        assert(newDepth <= 8);
+        assert(oldDepth <= 8);
+
+        if (oldDepth == newDepth) {
+            return val;
+        }
+        if (oldDepth == 0) {
+            // newDepth is non-zero here because the depths differ.
+            return (1 << newDepth) - 1;
+        }
+        if (newDepth > oldDepth) {
+            return Replicate(val, oldDepth, newDepth);
+        }
+        if (newDepth == 0) {
+            return 0xFF;
+        }
+
+        // Narrowing: add half of the dropped range before shifting so the value
+        // rounds to nearest, then clamp to the new maximum.
+        const uint8_t droppedBits = oldDepth - newDepth;
+        uint16_t narrowed = static_cast<uint16_t>(val);
+        narrowed = (narrowed + (1 << (droppedBits - 1))) >> droppedBits;
+        narrowed = ::std::min<uint16_t>(narrowed, (1 << newDepth) - 1);
+        return static_cast<uint8_t>(narrowed);
+    }
+
+    // Channel accessors (const and mutable).
+    const ChannelType& A() const {
+        return color[0];
+    }
+    ChannelType& A() {
+        return color[0];
+    }
+    const ChannelType& R() const {
+        return color[1];
+    }
+    ChannelType& R() {
+        return color[1];
+    }
+    const ChannelType& G() const {
+        return color[2];
+    }
+    ChannelType& G() {
+        return color[2];
+    }
+    const ChannelType& B() const {
+        return color[3];
+    }
+    ChannelType& B() {
+        return color[3];
+    }
+
+    // Indexed access in storage order: 0 = A, 1 = R, 2 = G, 3 = B.
+    const ChannelType& Component(uint32_t idx) const {
+        return color[idx];
+    }
+    ChannelType& Component(uint32_t idx) {
+        return color[idx];
+    }
+
+    // Copies the per-channel bit depths into outDepth.
+    void GetBitDepth(uint8_t (&outDepth)[4]) const {
+        for (int ch = 0; ch < 4; ++ch) {
+            outDepth[ch] = m_BitDepth[ch];
+        }
+    }
+
+    // Converts a copy of this pixel to 8 bits per channel and packs it into a
+    // 32-bit integer with R in the least significant byte, followed by G, B and A
+    // in the most significant byte.
+    uint32_t Pack() const {
+        const uint8_t eightBitDepth[4] = {8, 8, 8, 8};
+        Pixel eightBit(*this);
+        eightBit.ChangeBitDepth(eightBitDepth);
+
+        uint32_t packed = 0;
+        packed |= eightBit.A();
+        packed = (packed << 8) | eightBit.B();
+        packed = (packed << 8) | eightBit.G();
+        packed = (packed << 8) | eightBit.R();
+        return packed;
+    }
+
+    // Clamps the pixel to the range [0,255]
+    void ClampByte() {
+        for (uint32_t ch = 0; ch < 4; ++ch) {
+            if (color[ch] < 0) {
+                color[ch] = 0;
+            } else if (color[ch] > 255) {
+                color[ch] = 255;
+            }
+        }
+    }
+
+    // Sets the alpha channel to 255.
+    void MakeOpaque() {
+        color[0] = 255;
+    }
+};
+
+// Decodes the colour endpoint values of a block and unquantizes them to 0-255.
+//
+// out               receives one unquantized value per colour value; it must have
+//                   room for the total computed from modes (see below).
+// data              bit-packed colour data, read from its start.
+// modes             colour endpoint mode of each partition; a mode contributes
+//                   ((mode >> 2) + 1) * 2 values.
+// nPartitions       number of entries of modes that are used.
+// nBitsForColorData number of bits available for the encoded colour values; it
+//                   determines which integer encoding (value range) is assumed.
+void DecodeColorValues(uint32_t* out, uint8_t* data, uint32_t* modes, const uint32_t nPartitions,
+                       const uint32_t nBitsForColorData) {
+    // First figure out how many color values we have
+    uint32_t nValues = 0;
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        nValues += ((modes[i] >> 2) + 1) << 1;
+    }
+
+    // Then based on the number of values and the remaining number of bits,
+    // figure out the max value for each of them...
+    // The search walks downwards from 255 and stops at the first (largest) range
+    // whose encoding of nValues values fits into the available bits.
+    uint32_t range = 256;
+    while (--range > 0) {
+        IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range);
+        uint32_t bitLength = val.GetBitLength(nValues);
+        if (bitLength <= nBitsForColorData) {
+            // Find the smallest possible range that matches the given encoding
+            while (--range > 0) {
+                IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range);
+                if (!newval.MatchesEncoding(val)) {
+                    break;
+                }
+            }
+
+            // Return to last matching range.
+            // (the inner loop stepped one past it, or reached zero)
+            range++;
+            break;
+        }
+    }
+
+    // We now have enough to decode our integer sequence.
+    std::vector<IntegerEncodedValue> decodedColorValues;
+    BitStream colorStream(data);
+    IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
+
+    // Once we have the decoded values, we need to dequantize them to the 0-255 range
+    // This procedure is outlined in ASTC spec C.2.13
+    uint32_t outIdx = 0;
+    std::vector<IntegerEncodedValue>::const_iterator itr;
+    for (itr = decodedColorValues.begin(); itr != decodedColorValues.end(); itr++) {
+        // Have we already decoded all that we need?
+        if (outIdx >= nValues) {
+            break;
+        }
+
+        const IntegerEncodedValue& val = *itr;
+        uint32_t bitlen = val.BaseBitLength();
+        uint32_t bitval = val.GetBitValue();
+
+        assert(bitlen >= 1);
+
+        // For trit/quint encodings the result is built from four terms: A (the
+        // replicated LSB), B (a bit pattern made from the remaining plain bits),
+        // C (a per-range constant) and D (the trit or quint itself).
+        uint32_t A = 0, B = 0, C = 0, D = 0;
+        // A is just the lsb replicated 9 times.
+        A = Replicate(bitval & 1, 1, 9);
+
+        switch (val.GetEncoding()) {
+        // Replicate bits
+        case eIntegerEncoding_JustBits:
+            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            break;
+
+        // Use algorithm in C.2.13
+        case eIntegerEncoding_Trit: {
+
+            D = val.GetTritValue();
+
+            // The letters in the B patterns below name the plain bits of the value
+            // above the LSB: b is bit 1, c is bit 2, and so on.
+            switch (bitlen) {
+            case 1: {
+                C = 204;
+            } break;
+
+            case 2: {
+                C = 93;
+                // B = b000b0bb0
+                uint32_t b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+            } break;
+
+            case 3: {
+                C = 44;
+                // B = cb000cbcb
+                uint32_t cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+            } break;
+
+            case 4: {
+                C = 22;
+                // B = dcb000dcb
+                uint32_t dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+            } break;
+
+            case 5: {
+                C = 11;
+                // B = edcb000ed
+                uint32_t edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+            } break;
+
+            case 6: {
+                C = 5;
+                // B = fedcb000f
+                uint32_t fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+            } break;
+
+            default:
+                assert(!"Unsupported trit encoding for color values!");
+                break;
+            } // switch(bitlen)
+        }     // case eIntegerEncoding_Trit
+        break;
+
+        case eIntegerEncoding_Quint: {
+
+            D = val.GetQuintValue();
+
+            switch (bitlen) {
+            case 1: {
+                C = 113;
+            } break;
+
+            case 2: {
+                C = 54;
+                // B = b0000bb00
+                uint32_t b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+            } break;
+
+            case 3: {
+                C = 26;
+                // B = cb0000cbc
+                uint32_t cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+            } break;
+
+            case 4: {
+                C = 13;
+                // B = dcb0000dc
+                uint32_t dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+            } break;
+
+            case 5: {
+                C = 6;
+                // B = edcb0000e
+                uint32_t edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+            } break;
+
+            default:
+                assert(!"Unsupported quint encoding for color values!");
+                break;
+            } // switch(bitlen)
+        }     // case eIntegerEncoding_Quint
+        break;
+        } // switch(val.GetEncoding())
+
+        // Combine the terms for trit/quint encodings; plain-bit values were already
+        // written inside the switch.
+        if (val.GetEncoding() != eIntegerEncoding_JustBits) {
+            uint32_t T = D * C + B;
+            T ^= A;
+            T = (A & 0x80) | (T >> 2);
+            out[outIdx++] = T;
+        }
+    }
+
+    // Make sure that each of our values is in the proper range...
+    for (uint32_t i = 0; i < nValues; i++) {
+        assert(out[i] <= 255);
+    }
+}
+
+// Converts one decoded texel weight to the range [0, 64].
+//
+// Plain-bit values are widened to 6 bits by bit replication. Trit and quint values
+// with no plain bits use a fixed table; those with plain bits are built from the
+// terms A (replicated LSB), B (pattern of the remaining plain bits), C (per-range
+// constant) and D (the trit or quint). The resulting 0..63 value is finally
+// stretched to 0..64 by adding one to everything above 32.
+uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
+    const uint32_t rawBits = val.GetBitValue();
+    const uint32_t nBits = val.BaseBitLength();
+    const bool plainBitsOnly = val.GetEncoding() == eIntegerEncoding_JustBits;
+
+    const uint32_t A = Replicate(rawBits & 1, 1, 7);
+    uint32_t B = 0;
+    uint32_t C = 0;
+    uint32_t D = 0;
+    uint32_t result = 0;
+
+    switch (val.GetEncoding()) {
+    case eIntegerEncoding_JustBits:
+        result = Replicate(rawBits, nBits, 6);
+        break;
+
+    case eIntegerEncoding_Trit:
+        D = val.GetTritValue();
+        assert(D < 3);
+
+        if (nBits == 0) {
+            const uint32_t tritOnly[3] = {0, 32, 63};
+            result = tritOnly[D];
+        } else if (nBits == 1) {
+            C = 50;
+        } else if (nBits == 2) {
+            // B = b000b0b, where b is bit 1 of the value.
+            const uint32_t b = (rawBits >> 1) & 1;
+            C = 23;
+            B = (b << 6) | (b << 2) | b;
+        } else if (nBits == 3) {
+            // B = cb000cb, where cb are bits 2..1 of the value.
+            const uint32_t cb = (rawBits >> 1) & 3;
+            C = 11;
+            B = (cb << 5) | cb;
+        } else {
+            assert(!"Invalid trit encoding for texel weight");
+        }
+        break;
+
+    case eIntegerEncoding_Quint:
+        D = val.GetQuintValue();
+        assert(D < 5);
+
+        if (nBits == 0) {
+            const uint32_t quintOnly[5] = {0, 16, 32, 47, 63};
+            result = quintOnly[D];
+        } else if (nBits == 1) {
+            C = 28;
+        } else if (nBits == 2) {
+            // B = b0000b0, where b is bit 1 of the value.
+            const uint32_t b = (rawBits >> 1) & 1;
+            C = 13;
+            B = (b << 6) | (b << 1);
+        } else {
+            assert(!"Invalid quint encoding for texel weight");
+        }
+        break;
+    }
+
+    // Trits/quints that carry plain bits are combined from the four terms.
+    if (!plainBitsOnly && nBits > 0) {
+        result = D * C + B;
+        result ^= A;
+        result = (A & 0x20) | (result >> 2);
+    }
+
+    assert(result < 64);
+
+    // Change from [0,63] to [0,64]
+    return (result > 32) ? result + 1 : result;
+}
+
+// Unquantizes the decoded weights of a block and expands the weight grid to one
+// weight per texel (the infill of ASTC spec section C.2.18).
+//
+// out         receives blockWidth * blockHeight weights for plane 0 and, in
+//             dual-plane mode, for plane 1 as well.
+// weights     decoded weights; in dual-plane mode the values of the two planes
+//             alternate (plane 0 first).
+// params      weight grid description (m_Width, m_Height, m_bDualPlane).
+// blockWidth, blockHeight
+//             texel footprint of the block; both must be greater than 1 because
+//             the scale factors divide by (size - 1).
+void UnquantizeTexelWeights(uint32_t out[2][144], std::vector<IntegerEncodedValue>& weights,
+                            const TexelWeightParams& params, const uint32_t blockWidth,
+                            const uint32_t blockHeight) {
+    const uint32_t gridSize = params.m_Width * params.m_Height;
+
+    // Zero-initialised so that grid cells for which no weight was decoded read as 0
+    // during the infill instead of as indeterminate values.
+    uint32_t unquantized[2][144] = {};
+
+    uint32_t weightIdx = 0;
+    std::vector<IntegerEncodedValue>::const_iterator itr = weights.begin();
+    while (itr != weights.end() && weightIdx < gridSize) {
+        unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
+        ++itr;
+
+        if (params.m_bDualPlane) {
+            // The end check must come before the dereference: a list with an odd
+            // number of entries has no plane-1 partner for the last weight.
+            if (itr == weights.end()) {
+                break;
+            }
+            unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
+            ++itr;
+        }
+
+        ++weightIdx;
+    }
+
+    // Do infill if necessary (Section C.2.18) ...
+    // Ds/Dt scale a texel coordinate so that the far edge of the block maps to 1024.
+    const uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
+    const uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
+
+    const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U;
+    for (uint32_t plane = 0; plane < kPlaneScale; plane++) {
+        for (uint32_t t = 0; t < blockHeight; t++) {
+            for (uint32_t s = 0; s < blockWidth; s++) {
+                const uint32_t cs = Ds * s;
+                const uint32_t ct = Dt * t;
+
+                // Position within the weight grid in 4.4 fixed point.
+                const uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6;
+                const uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6;
+
+                const uint32_t js = gs >> 4;
+                const uint32_t fs = gs & 0xF;
+
+                const uint32_t jt = gt >> 4;
+                const uint32_t ft = gt & 0x0F;
+
+                // Bilinear factors of the four surrounding grid cells; they sum to 16.
+                const uint32_t w11 = (fs * ft + 8) >> 4;
+                const uint32_t w10 = ft - w11;
+                const uint32_t w01 = fs - w11;
+                const uint32_t w00 = 16 - fs - ft + w11;
+
+                const uint32_t v0 = js + jt * params.m_Width;
+
+                const uint32_t cell[4] = {v0, v0 + 1, v0 + params.m_Width,
+                                          v0 + params.m_Width + 1};
+                const uint32_t factor[4] = {w00, w01, w10, w11};
+
+                // Cells outside the grid contribute a weight of zero.
+                uint32_t sum = 8;
+                for (uint32_t k = 0; k < 4; k++) {
+                    if (cell[k] < gridSize) {
+                        sum += unquantized[plane][cell[k]] * factor[k];
+                    }
+                }
+
+                out[plane][t * blockWidth + s] = sum >> 4;
+            }
+        }
+    }
+}
+
+// Bit transfer as described in C.2.14: moves bit 7 of a into bit 7 of b (after
+// shifting b right by one), then turns the remaining bits 6..1 of a into a signed
+// 6-bit value in the range [-32, 31].
+static inline void BitTransferSigned(int32_t& a, int32_t& b) {
+    b = (b >> 1) | (a & 0x80);
+    a = (a >> 1) & 0x3F;
+    if (a & 0x20) {
+        // Bit 5 is the sign bit of the 6-bit value.
+        a -= 0x40;
+    }
+}
+
+// Blue contraction as described in C.2.14: red and green are each averaged with
+// blue (rounding down via an arithmetic shift), while alpha and blue pass through
+// unchanged. All results are narrowed to 16 bits.
+static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) {
+    const int16_t alpha = static_cast<int16_t>(a);
+    const int16_t red = static_cast<int16_t>((r + b) >> 1);
+    const int16_t green = static_cast<int16_t>((g + b) >> 1);
+    const int16_t blue = static_cast<int16_t>(b);
+    return Pixel(alpha, red, green, blue);
+}
+
+// Partition selection functions as specified in
+// C.2.21
+//
+// hash52 scrambles a 32-bit value with a fixed sequence of shift/add/xor steps; all
+// arithmetic wraps modulo 2^32. The order of the steps must not be changed.
+static inline uint32_t hash52(uint32_t p) {
+    p = p ^ (p >> 15);
+    p = p - (p << 17);
+    p = p + (p << 7);
+    p = p + (p << 4);
+    p = p ^ (p >> 5);
+    p = p + (p << 16);
+    p = p ^ (p >> 7);
+    p = p ^ (p >> 3);
+    p = p ^ (p << 6);
+    p = p ^ (p >> 17);
+    return p;
+}
+
+// Returns the partition index (0..3) that the texel at (x, y, z) belongs to.
+//
+// seed           partition seed; it is offset by (partitionCount - 1) * 1024 before
+//                being hashed.
+// x, y, z        texel coordinates; they are doubled when smallBlock is non-zero.
+// partitionCount number of partitions; with a single partition the result is
+//                always 0.
+// smallBlock     non-zero to double the coordinates before use.
+//
+// The statement order and all constants below are load-bearing: the result is
+// derived from a hash, so any change alters which partition a texel maps to.
+static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
+                                int32_t partitionCount, int32_t smallBlock) {
+    if (1 == partitionCount)
+        return 0;
+
+    if (smallBlock) {
+        x <<= 1;
+        y <<= 1;
+        z <<= 1;
+    }
+
+    seed += (partitionCount - 1) * 1024;
+
+    // Twelve 4-bit seeds are carved out of the hashed value; seed9..seed12 overlap
+    // the bit ranges of the earlier ones.
+    uint32_t rnum = hash52(static_cast<uint32_t>(seed));
+    uint8_t seed1 = static_cast<uint8_t>(rnum & 0xF);
+    uint8_t seed2 = static_cast<uint8_t>((rnum >> 4) & 0xF);
+    uint8_t seed3 = static_cast<uint8_t>((rnum >> 8) & 0xF);
+    uint8_t seed4 = static_cast<uint8_t>((rnum >> 12) & 0xF);
+    uint8_t seed5 = static_cast<uint8_t>((rnum >> 16) & 0xF);
+    uint8_t seed6 = static_cast<uint8_t>((rnum >> 20) & 0xF);
+    uint8_t seed7 = static_cast<uint8_t>((rnum >> 24) & 0xF);
+    uint8_t seed8 = static_cast<uint8_t>((rnum >> 28) & 0xF);
+    uint8_t seed9 = static_cast<uint8_t>((rnum >> 18) & 0xF);
+    uint8_t seed10 = static_cast<uint8_t>((rnum >> 22) & 0xF);
+    uint8_t seed11 = static_cast<uint8_t>((rnum >> 26) & 0xF);
+    uint8_t seed12 = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF);
+
+    // Square each seed; the largest possible result, 15 * 15 = 225, still fits in
+    // a uint8_t.
+    seed1 *= seed1;
+    seed2 *= seed2;
+    seed3 *= seed3;
+    seed4 *= seed4;
+    seed5 *= seed5;
+    seed6 *= seed6;
+    seed7 *= seed7;
+    seed8 *= seed8;
+    seed9 *= seed9;
+    seed10 *= seed10;
+    seed11 *= seed11;
+    seed12 *= seed12;
+
+    // The shift amounts depend on the low bits of the (offset) seed and on whether
+    // there are exactly three partitions.
+    int32_t sh1, sh2, sh3;
+    if (seed & 1) {
+        sh1 = (seed & 2) ? 4 : 5;
+        sh2 = (partitionCount == 3) ? 6 : 5;
+    } else {
+        sh1 = (partitionCount == 3) ? 6 : 5;
+        sh2 = (seed & 2) ? 4 : 5;
+    }
+    sh3 = (seed & 0x10) ? sh1 : sh2;
+
+    seed1 >>= sh1;
+    seed2 >>= sh2;
+    seed3 >>= sh1;
+    seed4 >>= sh2;
+    seed5 >>= sh1;
+    seed6 >>= sh2;
+    seed7 >>= sh1;
+    seed8 >>= sh2;
+    seed9 >>= sh3;
+    seed10 >>= sh3;
+    seed11 >>= sh3;
+    seed12 >>= sh3;
+
+    // One score per candidate partition, each a seed-weighted sum of the
+    // coordinates plus a slice of the hash.
+    int32_t a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+    int32_t b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+    int32_t c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+    int32_t d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+    // Only the low six bits of each score take part in the comparison.
+    a &= 0x3F;
+    b &= 0x3F;
+    c &= 0x3F;
+    d &= 0x3F;
+
+    // Zero the scores of partitions beyond partitionCount.
+    if (partitionCount < 4)
+        d = 0;
+    if (partitionCount < 3)
+        c = 0;
+
+    // The highest score wins; ties go to the lower partition index.
+    if (a >= b && a >= c && a >= d)
+        return 0;
+    else if (b >= c && b >= d)
+        return 1;
+    else if (c >= d)
+        return 2;
+    return 3;
+}
+
+// Two-dimensional convenience wrapper around SelectPartition: the z coordinate is
+// fixed at zero.
+static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount,
+                                         int32_t smallBlock) {
+    const int32_t z = 0;
+    return SelectPartition(seed, x, y, z, partitionCount, smallBlock);
+}
+
+// Section C.2.14
+//
+// Builds the two colour endpoints of a partition from its unquantized colour values.
+//
+// ep1, ep2          receive the endpoints (Pixel channel order is A, R, G, B).
+// colorValues       pointer to the next unread colour value; it is advanced past the
+//                   2, 4, 6 or 8 values that the mode consumes.
+// colorEndpointMode endpoint mode; modes 0, 1, 4, 5, 6, 8, 9, 10, 12 and 13 are
+//                   handled. Any other mode trips an assertion and leaves ep1, ep2
+//                   and colorValues untouched.
+void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues,
+                      uint32_t colorEndpointMode) {
+#define READ_UINT_VALUES(N)                                                                        \
+    uint32_t v[N];                                                                                 \
+    for (uint32_t i = 0; i < N; i++) {                                                             \
+        v[i] = *(colorValues++);                                                                   \
+    }
+
+#define READ_INT_VALUES(N)                                                                         \
+    int32_t v[N];                                                                                  \
+    for (uint32_t i = 0; i < N; i++) {                                                             \
+        v[i] = static_cast<int32_t>(*(colorValues++));                                             \
+    }
+
+    switch (colorEndpointMode) {
+    // Luminance, two direct values, opaque.
+    case 0: {
+        READ_UINT_VALUES(2)
+        ep1 = Pixel(0xFF, v[0], v[0], v[0]);
+        ep2 = Pixel(0xFF, v[1], v[1], v[1]);
+    } break;
+
+    // Luminance, base plus offset, opaque.
+    case 1: {
+        READ_UINT_VALUES(2)
+        uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0);
+        // The second luminance is the base plus a 6-bit offset, clamped to 0xFF.
+        // (Using max here would force L1 to at least 0xFF for every input.)
+        uint32_t L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU);
+        ep1 = Pixel(0xFF, L0, L0, L0);
+        ep2 = Pixel(0xFF, L1, L1, L1);
+    } break;
+
+    // Luminance plus alpha, direct values.
+    case 4: {
+        READ_UINT_VALUES(4)
+        ep1 = Pixel(v[2], v[0], v[0], v[0]);
+        ep2 = Pixel(v[3], v[1], v[1], v[1]);
+    } break;
+
+    // Luminance plus alpha, base plus signed offset.
+    case 5: {
+        READ_INT_VALUES(4)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        ep1 = Pixel(v[2], v[0], v[0], v[0]);
+        ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+
+    // RGB, base colour plus a common scale factor, opaque.
+    case 6: {
+        READ_UINT_VALUES(4)
+        ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = Pixel(0xFF, v[0], v[1], v[2]);
+    } break;
+
+    // RGB, two direct colours, opaque. When the second colour sums lower than the
+    // first, the endpoints are swapped and blue-contracted.
+    case 8: {
+        READ_UINT_VALUES(6)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+            ep2 = Pixel(0xFF, v[1], v[3], v[5]);
+        } else {
+            ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
+            ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+        }
+    } break;
+
+    // RGB, base plus signed offset, opaque. A negative offset sum selects the
+    // swapped, blue-contracted form.
+    case 9: {
+        READ_INT_VALUES(6)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        BitTransferSigned(v[5], v[4]);
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+            ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+        } else {
+            ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+            ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+        }
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+
+    // RGB base plus scale, with two direct alpha values.
+    case 10: {
+        READ_UINT_VALUES(6)
+        ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = Pixel(v[5], v[0], v[1], v[2]);
+    } break;
+
+    // RGBA, two direct colours; same swap rule as mode 8.
+    case 12: {
+        READ_UINT_VALUES(8)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = Pixel(v[6], v[0], v[2], v[4]);
+            ep2 = Pixel(v[7], v[1], v[3], v[5]);
+        } else {
+            ep1 = BlueContract(v[7], v[1], v[3], v[5]);
+            ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+        }
+    } break;
+
+    // RGBA, base plus signed offset; same swap rule as mode 9.
+    case 13: {
+        READ_INT_VALUES(8)
+        BitTransferSigned(v[1], v[0]);
+        BitTransferSigned(v[3], v[2]);
+        BitTransferSigned(v[5], v[4]);
+        BitTransferSigned(v[7], v[6]);
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = Pixel(v[6], v[0], v[2], v[4]);
+            ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+        } else {
+            ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+            ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+        }
+        ep1.ClampByte();
+        ep2.ClampByte();
+    } break;
+
+    default:
+        assert(!"Unsupported color endpoint mode (is it HDR?)");
+        break;
+    }
+
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+
+void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth, const uint32_t blockHeight,
+                     uint32_t* outBuf) { // Decodes one 16-byte block into outBuf.
+    BitStream strm(inBuf);
+    TexelWeightParams weightParams = DecodeBlockInfo(strm);
+
+    // Was there an error? The failure paths below assert first, then call FillError and return.
+    if (weightParams.m_bError) {
+        assert(!"Invalid block mode");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    if (weightParams.m_bVoidExtentLDR) {
+        FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    if (weightParams.m_bVoidExtentHDR) {
+        assert(!"HDR void extent blocks are unsupported!");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    if (weightParams.m_Width > blockWidth) {
+        assert(!"Texel weight grid width should be smaller than block width");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    if (weightParams.m_Height > blockHeight) {
+        assert(!"Texel weight grid height should be smaller than block height");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    // Read num partitions (a 2-bit field holding the count minus one)
+    uint32_t nPartitions = strm.ReadBits(2) + 1;
+    assert(nPartitions <= 4);
+
+    if (nPartitions == 4 && weightParams.m_bDualPlane) {
+        assert(!"Dual plane mode is incompatible with four partition blocks");
+        FillError(outBuf, blockWidth, blockHeight);
+        return;
+    }
+
+    // Based on the number of partitions, read the color endpoint mode for
+    // each partition.
+
+    // Determine partitions, partition index, and color endpoint modes
+    int32_t planeIdx = -1;
+    uint32_t partitionIndex;
+    uint32_t colorEndpointMode[4] = {0, 0, 0, 0};
+
+    // Define color data: a zeroed 16-byte scratch buffer that colorEndpointStream writes into.
+    uint8_t colorEndpointData[16];
+    memset(colorEndpointData, 0, sizeof(colorEndpointData));
+    BitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
+
+    // Read extra config data: a 4-bit CEM for one partition, else a 10-bit index + 6-bit base CEM
+    uint32_t baseCEM = 0;
+    if (nPartitions == 1) {
+        colorEndpointMode[0] = strm.ReadBits(4);
+        partitionIndex = 0;
+    } else {
+        partitionIndex = strm.ReadBits(10);
+        baseCEM = strm.ReadBits(6);
+    }
+    uint32_t baseMode = (baseCEM & 3); // low two bits of the base CEM
+
+    // Remaining bits are color endpoint data...
+    uint32_t nWeightBits = weightParams.GetPackedBitSize();
+    int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead(); // 128 = 16 input bytes * 8
+
+    // Consider extra bits prior to texel data (2, 5 or 8 bits for 2, 3 or 4 partitions)...
+    uint32_t extraCEMbits = 0;
+    if (baseMode) {
+        switch (nPartitions) {
+        case 2:
+            extraCEMbits += 2;
+            break;
+        case 3:
+            extraCEMbits += 5;
+            break;
+        case 4:
+            extraCEMbits += 8;
+            break;
+        default:
+            assert(false);
+            break;
+        }
+    }
+    remainingBits -= extraCEMbits;
+
+    // Do we have a dual plane situation?
+    uint32_t planeSelectorBits = 0;
+    if (weightParams.m_bDualPlane) {
+        planeSelectorBits = 2;
+    }
+    remainingBits -= planeSelectorBits;
+
+    // Read color data, copying it at most 8 bits at a time into colorEndpointStream...
+    uint32_t colorDataBits = remainingBits; // saved before the loop consumes remainingBits
+    while (remainingBits > 0) {
+        uint32_t nb = std::min(remainingBits, 8);
+        uint32_t b = strm.ReadBits(nb);
+        colorEndpointStream.WriteBits(b, nb);
+        remainingBits -= 8; // may go negative after the last chunk, which ends the loop
+    }
+
+    // Read the plane selection bits (a zero-width read when the block is not dual plane)
+    planeIdx = strm.ReadBits(planeSelectorBits);
+
+    // Read the rest of the CEM
+    if (baseMode) {
+        uint32_t extraCEM = strm.ReadBits(extraCEMbits);
+        uint32_t CEM = (extraCEM << 6) | baseCEM; // extra bits sit above the 6-bit base CEM
+        CEM >>= 2; // drop the two baseMode bits
+
+        bool C[4] = {0}; // one selector bit per partition
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            C[i] = CEM & 1;
+            CEM >>= 1;
+        }
+
+        uint8_t M[4] = {0}; // two low mode bits per partition
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            M[i] = CEM & 3;
+            CEM >>= 2;
+            assert(M[i] <= 3);
+        }
+
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = baseMode; // result is ((baseMode - !C[i]) << 2) | M[i]
+            if (!(C[i]))
+                colorEndpointMode[i] -= 1;
+            colorEndpointMode[i] <<= 2;
+            colorEndpointMode[i] |= M[i];
+        }
+    } else if (nPartitions > 1) {
+        uint32_t CEM = baseCEM >> 2; // all partitions share the same mode
+        for (uint32_t i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = CEM;
+        }
+    }
+
+    // Make sure everything up till here is sane.
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        assert(colorEndpointMode[i] < 16);
+    }
+    assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
+
+    // Decode both color data and texel weight data
+    uint32_t colorValues[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
+                      colorDataBits);
+
+    Pixel endpoints[4][2]; // [partition][endpoint]
+    const uint32_t* colorValuesPtr = colorValues; // TODO confirm: advanced by ComputeEndpoints
+    for (uint32_t i = 0; i < nPartitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
+    }
+
+    // Read the texel weight data: start from a copy of the whole input block..
+    uint8_t texelWeightData[16];
+    memcpy(texelWeightData, inBuf, sizeof(texelWeightData));
+
+    // Reverse everything: swap byte i with byte 15 - i and bit-reverse each of them
+    for (uint32_t i = 0; i < 8; i++) {
+// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
+        unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i]));
+        unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i]));
+#undef REVERSE_BYTE
+
+        texelWeightData[i] = b;
+        texelWeightData[15 - i] = a;
+    }
+
+    // Make sure that higher non-texel bits are set to zero
+    const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
+    texelWeightData[clearByteStart - 1] &= (1 << (weightParams.GetPackedBitSize() % 8)) - 1;
+    memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
+
+    std::vector<IntegerEncodedValue> texelWeightValues;
+    BitStream weightStream(texelWeightData);
+
+    IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream,
+                                               weightParams.m_MaxWeight,
+                                               weightParams.GetNumWeightValues());
+
+    // Blocks can be at most 12x12, so we can have as many as 144 weights
+    uint32_t weights[2][144]; // [plane][texel], texel index is j * blockWidth + i
+    UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
+
+    // Now that we have endpoints and weights, we can interpolate and generate
+    // the proper decoding...
+    for (uint32_t j = 0; j < blockHeight; j++)
+        for (uint32_t i = 0; i < blockWidth; i++) {
+            uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions,
+                                                   (blockHeight * blockWidth) < 32);
+            assert(partition < nPartitions);
+
+            Pixel p;
+            for (uint32_t c = 0; c < 4; c++) {
+                uint32_t C0 = endpoints[partition][0].Component(c);
+                C0 = Replicate(C0, 8, 16); // NOTE(review): presumably widens 8 -> 16 bits
+                uint32_t C1 = endpoints[partition][1].Component(c);
+                C1 = Replicate(C1, 8, 16);
+
+                uint32_t plane = 0;
+                if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
+                    plane = 1; // this component takes its weight from the second plane
+                }
+
+                uint32_t weight = weights[plane][j * blockWidth + i];
+                uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64; // blend, rounded
+                if (C == 65535) {
+                    p.Component(c) = 255;
+                } else { // otherwise 255 * C / 65536, rounded to nearest
+                    double Cf = static_cast<double>(C);
+                    p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5);
+                }
+            }
+
+            outBuf[j * blockWidth + i] = p.Pack(); // row-major within the block
+        }
+}
+
+} // namespace ASTCC
+
+namespace Tegra::Texture::ASTC {
+// Decodes ASTC blocks from data into a width * height image with 4 bytes per pixel.
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+                                uint32_t block_width, uint32_t block_height) {
+    uint32_t blockIdx = 0; // blocks are consumed in row-major order
+    std::vector<uint8_t> outData;
+    outData.resize(height * width * 4);
+    for (uint32_t j = 0; j < height; j += block_height) {
+        for (uint32_t i = 0; i < width; i += block_width) {
+            // 16 input bytes per block; NOTE(review): data.size() is never checked against this.
+            uint8_t* blockPtr = data.data() + blockIdx * 16;
+
+            // Blocks can be at most 12x12; NOTE(review): block dimensions are not checked here
+            uint32_t uncompData[144];
+            ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData);
+
+            uint32_t decompWidth = std::min(block_width, width - i); // clip at the right edge
+            uint32_t decompHeight = std::min(block_height, height - j); // clip at the bottom edge
+
+            uint8_t* outRow = outData.data() + (j * width + i) * 4; // block's top-left pixel
+            for (uint32_t jj = 0; jj < decompHeight; jj++) {
+                memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4);
+            }
+
+            blockIdx++;
+        }
+    }
+
+    return outData;
+}
+
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
new file mode 100644
index 000000000..f0d7c0e56
--- /dev/null
+++ b/src/video_core/textures/astc.h
@@ -0,0 +1,15 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace Tegra::Texture::ASTC {
+// Decodes ASTC block data to 4 bytes per pixel; the result holds width * height * 4 bytes.
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+                                uint32_t block_width, uint32_t block_height);
+
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 2d2af5554..0db4367f1 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -53,8 +53,10 @@ u32 BytesPerPixel(TextureFormat format) {
     case TextureFormat::DXT45:
         // In this case a 'pixel' actually refers to a 4x4 tile.
         return 16;
+    case TextureFormat::ASTC_2D_4X4:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
+    case TextureFormat::BF10GF11RF11:
         return 4;
     case TextureFormat::A1B5G5R5:
     case TextureFormat::B5G6R5:
@@ -92,6 +94,8 @@ std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width,
     case TextureFormat::B5G6R5:
     case TextureFormat::R8:
     case TextureFormat::R16_G16_B16_A16:
+    case TextureFormat::BF10GF11RF11:
+    case TextureFormat::ASTC_2D_4X4:
         CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
                          unswizzled_data.data(), true, block_height);
         break;
@@ -113,11 +117,13 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     case TextureFormat::DXT23:
     case TextureFormat::DXT45:
     case TextureFormat::DXN1:
+    case TextureFormat::ASTC_2D_4X4:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::A1B5G5R5:
     case TextureFormat::B5G6R5:
     case TextureFormat::R8:
+    case TextureFormat::BF10GF11RF11:
         // TODO(Subv): For the time being just forward the same data without any decoding.
         rgba_data = texture_data;
         break;
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index f48ca30b8..a17eaf19d 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -122,6 +122,17 @@ enum class ComponentType : u32 {
     FLOAT = 7
 };
 
+enum class SwizzleSource : u32 { // stored in TICEntry's 3-bit x/y/z/w_source fields
+    Zero = 0,
+    // NOTE(review): value 1 has no enumerator here.
+    R = 2,
+    G = 3,
+    B = 4,
+    A = 5,
+    OneInt = 6,
+    OneFloat = 7,
+};
+
 union TextureHandle {
     u32 raw;
     BitField<0, 20, u32> tic_id;
@@ -139,6 +150,11 @@ struct TICEntry {
         BitField<10, 3, ComponentType> g_type;
         BitField<13, 3, ComponentType> b_type;
         BitField<16, 3, ComponentType> a_type;
+
+        BitField<19, 3, SwizzleSource> x_source;
+        BitField<22, 3, SwizzleSource> y_source;
+        BitField<25, 3, SwizzleSource> z_source;
+        BitField<28, 3, SwizzleSource> w_source;
     };
     u32 address_low;
     union {