13 files changed, 737 insertions, 345 deletions
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 0a33868b7..3ba20f978 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -196,8 +196,10 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     auto& gpu = Core::System().GetInstance().GPU().Maxwell3D();
     ASSERT_MSG(!gpu.regs.shader_config[0].enable, "VertexA is unsupported!");
 
-    // Next available bindpoint to use when uploading the const buffers to the GLSL shaders.
+    // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
+    // shaders.
     u32 current_constbuffer_bindpoint = 0;
+    u32 current_texture_bindpoint = 0;
 
     for (unsigned index = 1; index < Maxwell::MaxShaderProgram; ++index) {
         auto& shader_config = gpu.regs.shader_config[index];
@@ -212,13 +214,17 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             continue;
         }
 
+        GLShader::MaxwellUniformData ubo{};
+        ubo.SetFromRegs(gpu.state.shader_stages[stage]);
+        std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
+
+        // Flush the buffer so that the GPU can see the data we just wrote.
+        glFlushMappedBufferRange(GL_ARRAY_BUFFER, buffer_offset, sizeof(ubo));
+
         // Upload uniform data as one UBO per stage
         const GLintptr ubo_offset = buffer_offset;
         copy_buffer(uniform_buffers[stage].handle, ubo_offset,
                     sizeof(GLShader::MaxwellUniformData));
-        GLShader::MaxwellUniformData* ub_ptr =
-            reinterpret_cast<GLShader::MaxwellUniformData*>(buffer_ptr);
-        ub_ptr->SetFromRegs(gpu.state.shader_stages[stage]);
 
         buffer_ptr += sizeof(GLShader::MaxwellUniformData);
         buffer_offset += sizeof(GLShader::MaxwellUniformData);
@@ -258,6 +264,11 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
         current_constbuffer_bindpoint =
             SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
                               current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
+
+        // Configure the textures for this shader stage.
+        current_texture_bindpoint =
+            SetupTextures(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
+                          current_texture_bindpoint, shader_resources.texture_samplers);
     }
 
     shader_program_manager->UseTrivialGeometryShader();
@@ -338,12 +349,12 @@ void RasterizerOpenGL::DrawArrays() {
     // Sync the viewport
     SyncViewport(surfaces_rect, res_scale);
 
+    // Sync the blend state registers
+    SyncBlendState();
+
     // TODO(bunnei): Sync framebuffer_scale uniform here
     // TODO(bunnei): Sync scissorbox uniform(s) here
 
-    // Sync and bind the texture surfaces
-    BindTextures();
-
     // Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable
     // scissor test to prevent drawing outside of the framebuffer region
     state.scissor.enabled = true;
@@ -447,65 +458,7 @@ void RasterizerOpenGL::DrawArrays() {
     }
 }
 
-void RasterizerOpenGL::BindTextures() {
-    using Regs = Tegra::Engines::Maxwell3D::Regs;
-    auto& maxwell3d = Core::System::GetInstance().GPU().Get3DEngine();
-
-    // Each Maxwell shader stage can have an arbitrary number of textures, but we're limited to a
-    // certain number in OpenGL. We try to only use the minimum amount of host textures by not
-    // keeping a 1:1 relation between guest texture ids and host texture ids, ie, guest texture id 8
-    // can be host texture id 0 if it's the only texture used in the guest shader program.
-    u32 host_texture_index = 0;
-    for (u32 stage = 0; stage < Regs::MaxShaderStage; ++stage) {
-        ASSERT(host_texture_index < texture_samplers.size());
-        const auto textures = maxwell3d.GetStageTextures(static_cast<Regs::ShaderStage>(stage));
-        for (unsigned texture_index = 0; texture_index < textures.size(); ++texture_index) {
-            const auto& texture = textures[texture_index];
-
-            if (texture.enabled) {
-                texture_samplers[host_texture_index].SyncWithConfig(texture.tsc);
-                Surface surface = res_cache.GetTextureSurface(texture);
-                if (surface != nullptr) {
-                    state.texture_units[host_texture_index].texture_2d = surface->texture.handle;
-                } else {
-                    // Can occur when texture addr is null or its memory is unmapped/invalid
-                    state.texture_units[texture_index].texture_2d = 0;
-                }
-
-                ++host_texture_index;
-            } else {
-                state.texture_units[texture_index].texture_2d = 0;
-            }
-        }
-    }
-}
-
-void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {
-    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
-    switch (method) {
-    case MAXWELL3D_REG_INDEX(blend.separate_alpha):
-        ASSERT_MSG(false, "unimplemented");
-        break;
-    case MAXWELL3D_REG_INDEX(blend.equation_rgb):
-        state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.blend.equation_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_source_rgb):
-        state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_dest_rgb):
-        state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_rgb);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.equation_a):
-        state.blend.a_equation = MaxwellToGL::BlendEquation(regs.blend.equation_a);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_source_a):
-        state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_a);
-        break;
-    case MAXWELL3D_REG_INDEX(blend.factor_dest_a):
-        state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_a);
-        break;
-    }
-}
+void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {} // no-op: blend synced per draw
 
 void RasterizerOpenGL::FlushAll() {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
@@ -654,7 +607,16 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
         buffer_draw_state.bindpoint = current_bindpoint + bindpoint;
 
         boost::optional<VAddr> addr = gpu.memory_manager->GpuToCpuAddress(buffer.address);
-        std::vector<u8> data(used_buffer.GetSize() * sizeof(float));
+
+        std::vector<u8> data;
+        if (used_buffer.IsIndirect()) {
+            // Buffer is accessed indirectly, so upload the entire thing
+            data.resize(buffer.size * sizeof(float));
+        } else {
+            // Buffer is accessed directly, upload just what we use
+            data.resize(used_buffer.GetSize() * sizeof(float));
+        }
+
         Memory::ReadBlock(*addr, data.data(), data.size());
 
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer_draw_state.ssbo);
@@ -671,7 +633,53 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
 
     state.Apply();
 
-    return current_bindpoint + entries.size();
+    return current_bindpoint + static_cast<u32>(entries.size());
+}
+
+// Assigns consecutive host texture units, starting at current_unit, to the samplers used by the
+// guest shader of this stage: each sampler uniform is pointed at its unit, the sampler object is
+// synced with the guest config and the backing surface is bound together with its swizzle.
+// Returns the first texture unit that is still free after this stage.
+u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
+                                    const std::vector<GLShader::SamplerEntry>& entries) {
+    auto& maxwell3d = Core::System::GetInstance().GPU().Get3DEngine();
+
+    ASSERT_MSG(maxwell3d.IsShaderStageEnabled(stage),
+               "Attempted to upload textures of disabled shader stage");
+
+    ASSERT_MSG(current_unit + entries.size() <= std::size(state.texture_units),
+               "Exceeded the number of active textures.");
+
+    u32 unit = current_unit;
+    for (const auto& sampler : entries) {
+        // Point the shader's sampler uniform at the host unit chosen for it.
+        const GLint location = glGetUniformLocation(program, sampler.GetName().c_str());
+        ASSERT(location != -1);
+        glProgramUniform1i(program, location, unit);
+
+        const auto texture = maxwell3d.GetStageTexture(sampler.GetStage(), sampler.GetOffset());
+        ASSERT(texture.enabled);
+
+        // Sync the sampler object first, then resolve the surface that backs the texture.
+        texture_samplers[unit].SyncWithConfig(texture.tsc);
+        auto& unit_state = state.texture_units[unit];
+        const Surface surface = res_cache.GetTextureSurface(texture);
+        if (surface == nullptr) {
+            // Can occur when texture addr is null or its memory is unmapped/invalid
+            unit_state.texture_2d = 0;
+        } else {
+            unit_state.texture_2d = surface->texture.handle;
+            unit_state.swizzle.r = MaxwellToGL::SwizzleSource(texture.tic.x_source);
+            unit_state.swizzle.g = MaxwellToGL::SwizzleSource(texture.tic.y_source);
+            unit_state.swizzle.b = MaxwellToGL::SwizzleSource(texture.tic.z_source);
+            unit_state.swizzle.a = MaxwellToGL::SwizzleSource(texture.tic.w_source);
+        }
+        ++unit;
+    }
+
+    state.Apply();
+
+    return unit;
 }
 
 void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
@@ -730,14 +738,21 @@ void RasterizerOpenGL::SyncDepthOffset() {
     UNREACHABLE();
 }
 
-void RasterizerOpenGL::SyncBlendEnabled() {
-    UNREACHABLE();
-}
+void RasterizerOpenGL::SyncBlendState() {
+    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
 
-void RasterizerOpenGL::SyncBlendFuncs() {
-    UNREACHABLE();
-}
+    // TODO(Subv): Support more than just render target 0.
+    state.blend.enabled = regs.blend.enable[0] != 0;
 
-void RasterizerOpenGL::SyncBlendColor() {
-    UNREACHABLE();
+    if (!state.blend.enabled)
+        return; // blending off: equations and factors keep their previous values
+    // Blending is on: copy render target 0's equations and factors from independent_blend[0].
+    ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
+    ASSERT_MSG(!regs.independent_blend[0].separate_alpha, "Unimplemented");
+    state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_rgb);
+    state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_rgb);
+    state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_rgb);
+    state.blend.a_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_a);
+    state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_a);
+    state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_a);
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 4b915c76a..b7c8cf843 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -80,9 +80,6 @@ private:
     void BindFramebufferSurfaces(const Surface& color_surface, const Surface& depth_surface,
                                  bool has_stencil);
 
-    /// Binds the required textures to OpenGL before drawing a batch.
-    void BindTextures();
-
     /*
      * Configures the current constbuffers to use for the draw command.
      * @param stage The shader stage to configure buffers for.
@@ -95,6 +92,17 @@ private:
                           u32 current_bindpoint,
                           const std::vector<GLShader::ConstBufferEntry>& entries);
 
+    /*
+     * Binds the textures sampled by one shader stage to host texture units for the draw command.
+     * @param stage The shader stage to configure textures for; it must be enabled.
+     * @param program The OpenGL program object that contains the specified stage.
+     * @param current_unit First host texture unit to use; entry i is bound to current_unit + i.
+     * @param entries Vector describing the textures that are actually used in the guest shader.
+     * @returns current_unit plus the number of entries: the first unit free for the next stage.
+     */
+    u32 SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, GLuint program,
+                      u32 current_unit, const std::vector<GLShader::SamplerEntry>& entries);
+
     /// Syncs the viewport to match the guest state
     void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect, u16 res_scale);
 
@@ -113,14 +121,8 @@ private:
     /// Syncs the depth offset to match the guest state
     void SyncDepthOffset();
 
-    /// Syncs the blend enabled status to match the guest state
-    void SyncBlendEnabled();
-
-    /// Syncs the blend functions to match the guest state
-    void SyncBlendFuncs();
-
-    /// Syncs the blend color to match the guest state
-    void SyncBlendColor();
+    /// Syncs the blend state to match the guest state
+    void SyncBlendState();
 
     bool has_ARB_buffer_storage;
     bool has_ARB_direct_state_access;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index d6048f639..61d670dcb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -28,6 +28,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/textures/astc.h"
 #include "video_core/textures/decoders.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
@@ -50,18 +51,22 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, false},                // A1B5G5R5
     {GL_R8, GL_RED, GL_UNSIGNED_BYTE, false},                                   // R8
     {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false},                                // RGBA16F
+    {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false},        // R11FG11FB10F
     {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, true},   // DXT1
     {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT23
     {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT45
     {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, true},           // DXN1
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                               // ASTC_2D_4X4
 }};
 
 static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
     const SurfaceType type = SurfaceParams::GetFormatType(pixel_format);
     if (type == SurfaceType::ColorTexture) {
         ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
-        // For now only UNORM components are supported, or RGBA16F which is type FLOAT
-        ASSERT(component_type == ComponentType::UNorm || pixel_format == PixelFormat::RGBA16F);
+        // For now only UNORM components are supported, or either R11FG11FB10F or RGBA16F which are
+        // type FLOAT
+        ASSERT(component_type == ComponentType::UNorm || pixel_format == PixelFormat::RGBA16F ||
+               pixel_format == PixelFormat::R11FG11FB10F);
         return tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
         // TODO(Subv): Implement depth formats
@@ -83,6 +88,23 @@ static u16 GetResolutionScaleFactor() {
                                 : Settings::values.resolution_factor);
 }
 
+// Replaces the ASTC-compressed bytes in `data` with the output of the software ASTC decoder.
+// The block footprint passed to the decoder is selected from `format`; a format without a known
+// footprint is logged as critical and trips UNREACHABLE().
+static void ConvertASTCToRGBA8(std::vector<u8>& data, PixelFormat format, u32 width, u32 height) {
+    u32 block_w = 0;
+    u32 block_h = 0;
+
+    if (format == PixelFormat::ASTC_2D_4X4) {
+        block_w = 4;
+        block_h = 4;
+    } else {
+        NGLOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
+        UNREACHABLE();
+    }
+    data = Tegra::Texture::ASTC::Decompress(data, width, height, block_w, block_h);
+}
+
 template <bool morton_to_gl, PixelFormat format>
 void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr base,
                 Tegra::GPUVAddr start, Tegra::GPUVAddr end) {
@@ -94,6 +116,12 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::
         auto data = Tegra::Texture::UnswizzleTexture(
             *gpu.memory_manager->GpuToCpuAddress(base),
             SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
+
+        if (SurfaceParams::IsFormatASTC(format)) {
+            // ASTC formats are converted to RGBA8 in software, as most PC GPUs do not support this
+            ConvertASTCToRGBA8(data, format, stride, height);
+        }
+
         std::memcpy(gl_buffer, data.data(), data.size());
     } else {
         // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
@@ -110,11 +138,12 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
                                      Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
-        MortonCopy<true, PixelFormat::ABGR8>,       MortonCopy<true, PixelFormat::B5G6R5>,
-        MortonCopy<true, PixelFormat::A2B10G10R10>, MortonCopy<true, PixelFormat::A1B5G5R5>,
-        MortonCopy<true, PixelFormat::R8>,          MortonCopy<true, PixelFormat::RGBA16F>,
-        MortonCopy<true, PixelFormat::DXT1>,        MortonCopy<true, PixelFormat::DXT23>,
-        MortonCopy<true, PixelFormat::DXT45>,       MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::ABGR8>,        MortonCopy<true, PixelFormat::B5G6R5>,
+        MortonCopy<true, PixelFormat::A2B10G10R10>,  MortonCopy<true, PixelFormat::A1B5G5R5>,
+        MortonCopy<true, PixelFormat::R8>,           MortonCopy<true, PixelFormat::RGBA16F>,
+        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::DXT1>,
+        MortonCopy<true, PixelFormat::DXT23>,        MortonCopy<true, PixelFormat::DXT45>,
+        MortonCopy<true, PixelFormat::DXN1>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
 };
 
 static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra::GPUVAddr,
@@ -127,11 +156,13 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
         MortonCopy<false, PixelFormat::A1B5G5R5>,
         MortonCopy<false, PixelFormat::R8>,
         MortonCopy<false, PixelFormat::RGBA16F>,
+        MortonCopy<false, PixelFormat::R11FG11FB10F>,
         // TODO(Subv): Swizzling the DXT1/DXT23/DXT45/DXN1 formats is not yet supported
         nullptr,
         nullptr,
         nullptr,
         nullptr,
+        MortonCopy<false, PixelFormat::ABGR8>,
 };
 
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -164,60 +195,10 @@ static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tup
 static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex,
                          const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type,
                          GLuint read_fb_handle, GLuint draw_fb_handle) {
-    OpenGLState state = OpenGLState::GetCurState();
-
-    OpenGLState prev_state = state;
-    SCOPE_EXIT({ prev_state.Apply(); });
-
-    // Make sure textures aren't bound to texture units, since going to bind them to framebuffer
-    // components
-    state.ResetTexture(src_tex);
-    state.ResetTexture(dst_tex);
-
-    state.draw.read_framebuffer = read_fb_handle;
-    state.draw.draw_framebuffer = draw_fb_handle;
-    state.Apply();
-
-    u32 buffers = 0;
-
-    if (type == SurfaceType::ColorTexture) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src_tex,
-                               0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, dst_tex,
-                               0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
-
-        buffers = GL_COLOR_BUFFER_BIT;
-    } else if (type == SurfaceType::Depth) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, src_tex, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, dst_tex, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
-
-        buffers = GL_DEPTH_BUFFER_BIT;
-    } else if (type == SurfaceType::DepthStencil) {
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                               src_tex, 0);
-
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                               dst_tex, 0);
-
-        buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
-    }
-
-    glBlitFramebuffer(src_rect.left, src_rect.bottom, src_rect.right, src_rect.top, dst_rect.left,
-                      dst_rect.bottom, dst_rect.right, dst_rect.top, buffers,
-                      buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
 
+    glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0, dst_tex,
+                       GL_TEXTURE_2D, 0, dst_rect.left, dst_rect.bottom, 0, src_rect.GetWidth(),
+                       src_rect.GetHeight(), 1); // depth 1: one 2D slice (0 copies nothing)
     return true;
 }
 
@@ -594,7 +575,7 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
         glCompressedTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format,
                                static_cast<GLsizei>(rect.GetWidth() * GetCompresssionFactor()),
                                static_cast<GLsizei>(rect.GetHeight() * GetCompresssionFactor()), 0,
-                               size, &gl_buffer[buffer_offset]);
+                               static_cast<GLsizei>(size), &gl_buffer[buffer_offset]);
     } else {
         glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
                         static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
@@ -933,9 +914,6 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, ScaleMatc
     // Use GetSurfaceSubRect instead
     ASSERT(params.width == params.stride);
 
-    ASSERT(!params.is_tiled ||
-           (params.GetActualWidth() % 8 == 0 && params.GetActualHeight() % 8 == 0));
-
     // Check for an exact match in existing surfaces
     Surface surface =
         FindMatch<MatchFlags::Exact | MatchFlags::Invalid>(surface_cache, params, match_res_scale);
@@ -1078,8 +1056,11 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
     params.addr = config.tic.Address();
     params.is_tiled = config.tic.IsTiled();
     params.pixel_format = SurfaceParams::PixelFormatFromTextureFormat(config.tic.format);
-    params.width = config.tic.Width() / params.GetCompresssionFactor();
-    params.height = config.tic.Height() / params.GetCompresssionFactor();
+
+    params.width = Common::AlignUp(config.tic.Width(), params.GetCompresssionFactor()) /
+                   params.GetCompresssionFactor();
+    params.height = Common::AlignUp(config.tic.Height(), params.GetCompresssionFactor()) /
+                    params.GetCompresssionFactor();
 
     // TODO(Subv): Different types per component are not supported.
     ASSERT(config.tic.r_type.Value() == config.tic.g_type.Value() &&
@@ -1090,6 +1071,13 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
 
     if (config.tic.IsTiled()) {
         params.block_height = config.tic.BlockHeight();
+
+        // TODO(bunnei): The below align up is a hack. This is here because some compressed textures
+        // are not a multiple of their own compression factor, and so this accounts for that. This
+        // could potentially result in an extra row of 4px being decoded if a texture is not a
+        // multiple of 4.
+        params.width = Common::AlignUp(params.width, 4);
+        params.height = Common::AlignUp(params.height, 4);
     } else {
         // Use the texture-provided stride value if the texture isn't tiled.
         params.stride = static_cast<u32>(params.PixelsInBytes(config.tic.Pitch()));
@@ -1097,23 +1085,6 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
 
     params.UpdateParams();
 
-    if (config.tic.Width() % 8 != 0 || config.tic.Height() % 8 != 0 ||
-        params.stride != params.width) {
-        Surface src_surface;
-        MathUtil::Rectangle<u32> rect;
-        std::tie(src_surface, rect) = GetSurfaceSubRect(params, ScaleMatch::Ignore, true);
-
-        params.res_scale = src_surface->res_scale;
-        Surface tmp_surface = CreateSurface(params);
-        BlitTextures(src_surface->texture.handle, rect, tmp_surface->texture.handle,
-                     tmp_surface->GetScaledRect(),
-                     SurfaceParams::GetFormatType(params.pixel_format), read_framebuffer.handle,
-                     draw_framebuffer.handle);
-
-        remove_surfaces.emplace(tmp_surface);
-        return tmp_surface;
-    }
-
     return GetSurface(params, ScaleMatch::Ignore, true);
 }
 
@@ -1288,7 +1259,7 @@ void RasterizerCacheOpenGL::ValidateSurface(const Surface& surface, Tegra::GPUVA
 
         const auto interval = *it & validate_interval;
         // Look for a valid surface to copy from
-        SurfaceParams params = surface->FromInterval(interval);
+        SurfaceParams params = *surface;
 
         Surface copy_surface =
             FindMatch<MatchFlags::Copy>(surface_cache, params, ScaleMatch::Ignore, interval);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 6f08678ab..9da945e19 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -60,10 +60,12 @@ struct SurfaceParams {
         A1B5G5R5 = 3,
         R8 = 4,
         RGBA16F = 5,
-        DXT1 = 6,
-        DXT23 = 7,
-        DXT45 = 8,
-        DXN1 = 9, // This is also known as BC4
+        R11FG11FB10F = 6,
+        DXT1 = 7,
+        DXT23 = 8,
+        DXT45 = 9,
+        DXN1 = 10, // This is also known as BC4
+        ASTC_2D_4X4 = 11,
 
         Max,
         Invalid = 255,
@@ -104,11 +106,13 @@ struct SurfaceParams {
             1, // A2B10G10R10
             1, // A1B5G5R5
             1, // R8
-            2, // RGBA16F
+            1, // RGBA16F
+            1, // R11FG11FB10F
             4, // DXT1
             4, // DXT23
             4, // DXT45
             4, // DXN1
+            1, // ASTC_2D_4X4
         }};
 
         ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
@@ -129,10 +133,12 @@ struct SurfaceParams {
             16,  // A1B5G5R5
             8,   // R8
             64,  // RGBA16F
+            32,  // R11FG11FB10F
             64,  // DXT1
             128, // DXT23
             128, // DXT45
             64,  // DXN1
+            32,  // ASTC_2D_4X4
         }};
 
         ASSERT(static_cast<size_t>(format) < bpp_table.size());
@@ -151,12 +157,23 @@ struct SurfaceParams {
             return PixelFormat::A2B10G10R10;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
             return PixelFormat::RGBA16F;
+        case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
+            return PixelFormat::R11FG11FB10F;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
         }
     }
 
+    /// Returns true when `format` is one of the ASTC pixel formats. ASTC_2D_4X4 is the only
+    /// one recognized so far; every other format yields false.
+    static bool IsFormatASTC(PixelFormat format) {
+        if (format == PixelFormat::ASTC_2D_4X4) {
+            return true;
+        }
+        return false;
+    }
+
     static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
         switch (format) {
         case Tegra::FramebufferConfig::PixelFormat::ABGR8:
@@ -182,6 +199,8 @@ struct SurfaceParams {
             return PixelFormat::R8;
         case Tegra::Texture::TextureFormat::R16_G16_B16_A16:
             return PixelFormat::RGBA16F;
+        case Tegra::Texture::TextureFormat::BF10GF11RF11:
+            return PixelFormat::R11FG11FB10F;
         case Tegra::Texture::TextureFormat::DXT1:
             return PixelFormat::DXT1;
         case Tegra::Texture::TextureFormat::DXT23:
@@ -190,6 +209,8 @@ struct SurfaceParams {
             return PixelFormat::DXT45;
         case Tegra::Texture::TextureFormat::DXN1:
             return PixelFormat::DXN1;
+        case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
+            return PixelFormat::ASTC_2D_4X4;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
@@ -211,6 +232,8 @@ struct SurfaceParams {
             return Tegra::Texture::TextureFormat::R8;
         case PixelFormat::RGBA16F:
             return Tegra::Texture::TextureFormat::R16_G16_B16_A16;
+        case PixelFormat::R11FG11FB10F:
+            return Tegra::Texture::TextureFormat::BF10GF11RF11;
         case PixelFormat::DXT1:
             return Tegra::Texture::TextureFormat::DXT1;
         case PixelFormat::DXT23:
@@ -219,6 +242,8 @@ struct SurfaceParams {
             return Tegra::Texture::TextureFormat::DXT45;
         case PixelFormat::DXN1:
             return Tegra::Texture::TextureFormat::DXN1;
+        case PixelFormat::ASTC_2D_4X4:
+            return Tegra::Texture::TextureFormat::ASTC_2D_4X4;
         default:
             UNREACHABLE();
         }
@@ -243,6 +268,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
             return ComponentType::UNorm;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
+        case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
             return ComponentType::Float;
         default:
             NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f886e49ca..65fed77ef 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -16,11 +16,11 @@ namespace Decompiler {
 
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
+using Tegra::Shader::LogicOperation;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 using Tegra::Shader::Sampler;
 using Tegra::Shader::SubOp;
-using Tegra::Shader::Uniform;
 
 constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH;
 
@@ -267,6 +267,27 @@ public:
     }
 
     /**
+     * Returns code that narrows an integer expression to the specified size via a shift pair.
+     * @param value GLSL integer expression to convert (parenthesized before it is shifted).
+     * @param size Register size to use for conversion instructions.
+     * @returns GLSL string corresponding to the value converted to the specified size.
+     */
+    static std::string ConvertIntegerSize(const std::string& value, Register::Size size) {
+        switch (size) {
+        case Register::Size::Byte:
+            return "(((" + value + ") << 24) >> 24)";
+        case Register::Size::Short:
+            return "(((" + value + ") << 16) >> 16)";
+        case Register::Size::Word:
+            // Already full width, the expression is passed through untouched.
+            return value;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented conversion size {}", static_cast<u32>(size));
+            UNREACHABLE();
+        }
+    }
+
+    /**
      * Gets a register as an float.
      * @param reg The register to get.
      * @param elem The element to use for the operation.
@@ -282,15 +303,18 @@ public:
      * @param reg The register to get.
      * @param elem The element to use for the operation.
      * @param is_signed Whether to get the register as a signed (or unsigned) integer.
+     * @param size Register size to use for conversion instructions.
      * @returns GLSL string corresponding to the register as an integer.
      */
-    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0,
-                                     bool is_signed = true) {
+    std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0, bool is_signed = true,
+                                     Register::Size size = Register::Size::Word) {
         const std::string func = GetGLSLConversionFunc(
             GLSLRegister::Type::Float,
             is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger);
 
-        return func + '(' + GetRegister(reg, elem) + ')';
+        std::string value = func + '(' + GetRegister(reg, elem) + ')';
+        // Narrow the reinterpreted register value to `size`; Word leaves it unchanged.
+        return ConvertIntegerSize(value, size);
     }
 
     /**
@@ -300,13 +324,15 @@ public:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
      * @param dest_elem Optional, the destination element to use for the operation.
      */
     void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value,
-                            u64 dest_num_components, u64 value_num_components, bool is_abs = false,
-                            u64 dest_elem = 0) {
-        SetRegister(reg, elem, value, dest_num_components, value_num_components, is_abs, dest_elem);
+                            u64 dest_num_components, u64 value_num_components,
+                            bool is_saturated = false, u64 dest_elem = 0) {
+        // Saturation is emitted as a GLSL clamp of the value to the [0.0, 1.0] range.
+        SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value,
+                    dest_num_components, value_num_components, dest_elem);
     }
 
     /**
@@ -316,18 +342,22 @@ public:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
+     * @param is_saturated Optional, when True, saturates the provided value.
      * @param dest_elem Optional, the destination element to use for the operation.
+     * @param size Register size to use for conversion instructions.
      */
     void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
                               const std::string& value, u64 dest_num_components,
-                              u64 value_num_components, bool is_abs = false, u64 dest_elem = 0) {
+                              u64 value_num_components, bool is_saturated = false,
+                              u64 dest_elem = 0, Register::Size size = Register::Size::Word) {
+        ASSERT_MSG(!is_saturated, "Unimplemented");
+        // The size conversion is applied to the value before it is wrapped in `func` below.
         const std::string func = GetGLSLConversionFunc(
             is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger,
             GLSLRegister::Type::Float);
 
-        SetRegister(reg, elem, func + '(' + value + ')', dest_num_components, value_num_components,
-                    is_abs, dest_elem);
+        SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')',
+                    dest_num_components, value_num_components, dest_elem);
     }
 
     /**
@@ -365,11 +395,9 @@ public:
     }
 
     /// Generates code representing a uniform (C buffer) register, interpreted as the input type.
-    std::string GetUniform(const Uniform& uniform, GLSLRegister::Type type) {
-        declr_const_buffers[uniform.index].MarkAsUsed(static_cast<unsigned>(uniform.index),
-                                                      static_cast<unsigned>(uniform.offset), stage);
-        std::string value =
-            'c' + std::to_string(uniform.index) + '[' + std::to_string(uniform.offset) + ']';
+    std::string GetUniform(u64 index, u64 offset, GLSLRegister::Type type) {
+        declr_const_buffers[index].MarkAsUsed(index, offset, stage);
+        std::string value = 'c' + std::to_string(index) + '[' + std::to_string(offset) + ']';
 
         if (type == GLSLRegister::Type::Float) {
             return value;
@@ -380,10 +408,19 @@ public:
         }
     }
 
-    /// Generates code representing a uniform (C buffer) register, interpreted as the type of the
-    /// destination register.
-    std::string GetUniform(const Uniform& uniform, const Register& dest_reg) {
-        return GetUniform(uniform, regs[dest_reg].GetActiveType());
+    /// Generates code reading a uniform (C buffer) entry selected at runtime: the integer bits of
+    /// index_reg plus offset, divided by 4, form the array index. Interpreted as the input type.
+    std::string GetUniformIndirect(u64 index, s64 offset, const Register& index_reg,
+                                   GLSLRegister::Type type) {
+        declr_const_buffers[index].MarkAsUsedIndirect(index, stage);
+        const std::string address = "(floatBitsToInt(" + GetRegister(index_reg, 0) + ") + " +
+                                    std::to_string(offset) + ") / 4";
+        std::string value = 'c' + std::to_string(index) + '[' + address + ']';
+        if (type == GLSLRegister::Type::Float)
+            return value;
+        if (type == GLSLRegister::Type::Integer)
+            return "floatBitsToInt(" + value + ')';
+        UNREACHABLE();
     }
 
     /// Add declarations for registers
@@ -425,6 +462,14 @@ public:
             ++const_buffer_layout;
         }
         declarations.AddNewLine();
+
+        // Append the sampler2D array for the used textures.
+        size_t num_samplers = GetSamplers().size();
+        if (num_samplers > 0) {
+            declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' +
+                                 std::to_string(num_samplers) + "];");
+            declarations.AddNewLine();
+        }
     }
 
     /// Returns a list of constant buffer declarations
@@ -435,6 +480,32 @@ public:
         return result;
     }
 
+    /// Returns a reference to the samplers used in the shader (valid while this object lives).
+    const std::vector<SamplerEntry>& GetSamplers() const {
+        return used_samplers;
+    }
+
+    /// Maps a shader sampler onto a GLSL sampler name. Samplers are identified by their index
+    /// value; the first access registers a new entry, later accesses reuse it.
+    std::string AccessSampler(const Sampler& sampler) {
+        const auto sampler_offset = static_cast<size_t>(sampler.index.Value());
+        const auto matches_offset = [sampler_offset](const SamplerEntry& candidate) {
+            return candidate.GetOffset() == sampler_offset;
+        };
+
+        // Reuse the mapping if this sampler has been seen before.
+        const auto existing =
+            std::find_if(used_samplers.begin(), used_samplers.end(), matches_offset);
+        if (existing != used_samplers.end()) {
+            return existing->GetName();
+        }
+
+        // First use: the new entry takes the next free slot at the end of the list.
+        SamplerEntry new_entry{stage, sampler_offset, used_samplers.size()};
+        used_samplers.push_back(new_entry);
+        return new_entry.GetName();
+    }
+
 private:
     /// Build GLSL conversion function, e.g. floatBitsToInt, intBitsToFloat, etc.
     const std::string GetGLSLConversionFunc(GLSLRegister::Type src, GLSLRegister::Type dest) const {
@@ -460,13 +531,11 @@ private:
      * @param value The code representing the value to assign.
      * @param dest_num_components Number of components in the destination.
      * @param value_num_components Number of components in the value.
-     * @param is_abs Optional, when True, applies absolute value to output.
      * @param dest_elem Optional, the destination element to use for the operation.
      */
     void SetRegister(const Register& reg, u64 elem, const std::string& value,
-                     u64 dest_num_components, u64 value_num_components, bool is_abs,
-                     u64 dest_elem) {
-        std::string dest = GetRegister(reg, dest_elem);
+                     u64 dest_num_components, u64 value_num_components, u64 dest_elem) {
+        std::string dest = GetRegister(reg, static_cast<u32>(dest_elem));
         if (dest_num_components > 1) {
             dest += GetSwizzle(elem);
         }
@@ -476,8 +545,6 @@ private:
             src += GetSwizzle(elem);
         }
 
-        src = is_abs ? "abs(" + src + ')' : src;
-
         shader.AddLine(dest + " = " + src + ';');
     }
 
@@ -498,7 +565,7 @@ private:
             // vertex shader, and what's the value of the fourth element when inside a Tess Eval
             // shader.
             ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
-            return "vec4(0, 0, gl_InstanceID, gl_VertexID)";
+            return "vec4(0, 0, uintBitsToFloat(gl_InstanceID), uintBitsToFloat(gl_VertexID))";
         default:
             const u32 index{static_cast<u32>(attribute) -
                             static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -544,6 +611,7 @@ private:
     std::set<Attribute::Index> declr_input_attribute;
     std::set<Attribute::Index> declr_output_attribute;
     std::array<ConstBufferEntry, Maxwell3D::Regs::MaxConstBuffers> declr_const_buffers;
+    std::vector<SamplerEntry> used_samplers;
     const Maxwell3D::Regs::ShaderStage& stage;
 };
 
@@ -563,7 +631,7 @@ public:
 
     /// Returns entries in the shader that are useful for external functions
     ShaderEntries GetEntries() const {
-        return {regs.GetConstBuffersDeclarations()};
+        return {regs.GetConstBuffersDeclarations(), regs.GetSamplers()};
     }
 
 private:
@@ -585,12 +653,8 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler) const {
-        // TODO(Subv): Support more than just texture sampler 0
-        ASSERT_MSG(sampler.index == Sampler::Index::Sampler_0, "unsupported");
-        const unsigned index{static_cast<unsigned>(sampler.index.Value()) -
-                             static_cast<unsigned>(Sampler::Index::Sampler_0)};
-        return "tex[" + std::to_string(index) + ']';
+    std::string GetSampler(const Sampler& sampler) {
+        return regs.AccessSampler(sampler);
     }
 
     /**
@@ -696,6 +760,31 @@ private:
         return (absolute_offset % SchedPeriod) == 0;
     }
 
+    /// Writes the result of a bitwise logic operation on op_a/op_b to dest as a signed integer.
+    void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
+                             const std::string& op_b) {
+        std::string result;
+        switch (logic_op) {
+        case LogicOperation::And:
+            result = '(' + op_a + " & " + op_b + ')';
+            break;
+        case LogicOperation::Or:
+            result = '(' + op_a + " | " + op_b + ')';
+            break;
+        case LogicOperation::Xor:
+            result = '(' + op_a + " ^ " + op_b + ')';
+            break;
+        case LogicOperation::PassB:
+            result = op_b;
+            break;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
+            UNREACHABLE();
+            return;
+        }
+        regs.SetRegisterToInteger(dest, true, 0, result, 1, 1);
+    }
+
     /**
      * Compiles a single instruction from Tegra to GLSL.
      * @param offset the offset of the Tegra shader instruction.
@@ -733,21 +822,25 @@ private:
 
         switch (opcode->GetType()) {
         case OpCode::Type::Arithmetic: {
-            std::string op_a = instr.alu.negate_a ? "-" : "";
-            op_a += regs.GetRegisterAsFloat(instr.gpr8);
+            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
             if (instr.alu.abs_a) {
                 op_a = "abs(" + op_a + ')';
             }
 
-            std::string op_b = instr.alu.negate_b ? "-" : "";
+            if (instr.alu.negate_a) {
+                op_a = "-(" + op_a + ')';
+            }
+
+            std::string op_b;
 
             if (instr.is_b_imm) {
-                op_b += GetImmediate19(instr);
+                op_b = GetImmediate19(instr);
             } else {
                 if (instr.is_b_gpr) {
-                    op_b += regs.GetRegisterAsFloat(instr.gpr20);
+                    op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, instr.gpr0);
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Float);
                 }
             }
 
@@ -755,6 +848,10 @@ private:
                 op_b = "abs(" + op_b + ')';
             }
 
+            if (instr.alu.negate_b) {
+                op_b = "-(" + op_b + ')';
+            }
+
             switch (opcode->GetId()) {
             case OpCode::Id::MOV_C:
             case OpCode::Id::MOV_R: {
@@ -762,58 +859,49 @@ private:
                 break;
             }
 
-            case OpCode::Id::MOV32_IMM: {
-                // mov32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
-                break;
-            }
             case OpCode::Id::FMUL_C:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
-                break;
-            }
-            case OpCode::Id::FMUL32_IMM: {
-                // fmul32i doesn't have abs or neg bits.
-                regs.SetRegisterToFloat(
-                    instr.gpr0, 0,
-                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::FADD_C:
             case OpCode::Id::FADD_R:
             case OpCode::Id::FADD_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
+                                        instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::MUFU: {
                 switch (instr.sub_op) {
                 case SubOp::Cos:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Sin:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Ex2:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Lg2:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Rcp:
-                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
+                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1,
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Rsq:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 case SubOp::Min:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "min(" + op_a + "," + op_b + ')', 1, 1,
-                                            instr.alu.abs_d);
+                                            instr.alu.saturate_d);
                     break;
                 default:
                     NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
@@ -850,52 +938,49 @@ private:
             }
             break;
         }
-        case OpCode::Type::Logic: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, false);
-
-            if (instr.alu.lop.invert_a)
-                op_a = "~(" + op_a + ')';
-
+        case OpCode::Type::ArithmeticImmediate: {
             switch (opcode->GetId()) {
-            case OpCode::Id::LOP32I: {
-                u32 imm = static_cast<u32>(instr.alu.imm20_32.Value());
+            case OpCode::Id::MOV32_IMM: {
+                regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
+                break;
+            }
+            case OpCode::Id::FMUL32_IMM: {
+                regs.SetRegisterToFloat(
+                    instr.gpr0, 0,
+                    regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                break;
+            }
+            }
+            break;
+        }
+        case OpCode::Type::Bfe: {
+            ASSERT_MSG(!instr.bfe.negate_b, "Unimplemented");
 
-                if (instr.alu.lop.invert_b)
-                    imm = ~imm;
+            std::string op_a = instr.bfe.negate_a ? "-" : "";
+            op_a += regs.GetRegisterAsInteger(instr.gpr8);
 
-                switch (instr.alu.lop.operation) {
-                case Tegra::Shader::LogicOperation::And: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " & " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Or: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " | " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                case Tegra::Shader::LogicOperation::Xor: {
-                    regs.SetRegisterToInteger(instr.gpr0, false, 0,
-                                              '(' + op_a + " ^ " + std::to_string(imm) + ')', 1, 1);
-                    break;
-                }
-                default:
-                    NGLOG_CRITICAL(HW_GPU, "Unimplemented lop32i operation: {}",
-                                   static_cast<u32>(instr.alu.lop.operation.Value()));
-                    UNREACHABLE();
-                }
+            switch (opcode->GetId()) {
+            case OpCode::Id::BFE_IMM: {
+                std::string inner_shift =
+                    '(' + op_a + " << " + std::to_string(instr.bfe.GetLeftShiftValue()) + ')';
+                std::string outer_shift =
+                    '(' + inner_shift + " >> " +
+                    std::to_string(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position) + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, outer_shift, 1, 1);
                 break;
             }
             default: {
-                NGLOG_CRITICAL(HW_GPU, "Unhandled logic instruction: {}", opcode->GetName());
+                NGLOG_CRITICAL(HW_GPU, "Unhandled BFE instruction: {}", opcode->GetName());
                 UNREACHABLE();
             }
             }
+
             break;
         }
 
         case OpCode::Type::Shift: {
-            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, false);
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
             std::string op_b;
 
             if (instr.is_b_imm) {
@@ -904,11 +989,25 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsInteger(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
                 }
             }
 
             switch (opcode->GetId()) {
+            case OpCode::Id::SHR_C:
+            case OpCode::Id::SHR_R:
+            case OpCode::Id::SHR_IMM: {
+                if (!instr.shift.is_signed) {
+                    // Logical shift right
+                    op_a = "uint(" + op_a + ')';
+                }
+
+                // Cast to int is superfluous for arithmetic shift, it's only for a logical shift
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(" + op_a + " >> " + op_b + ')',
+                                          1, 1);
+                break;
+            }
             case OpCode::Id::SHL_C:
             case OpCode::Id::SHL_R:
             case OpCode::Id::SHL_IMM:
@@ -922,28 +1021,101 @@ private:
             break;
         }
 
-        case OpCode::Type::ScaledAdd: {
+        case OpCode::Type::ArithmeticIntegerImmediate: {
             std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b = std::to_string(instr.alu.imm20_32.Value());
 
-            if (instr.iscadd.negate_a)
-                op_a = '-' + op_a;
+            switch (opcode->GetId()) {
+            case OpCode::Id::IADD32I:
+                if (instr.iadd32i.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+                                          instr.iadd32i.saturate != 0);
+                break;
+            case OpCode::Id::LOP32I: {
+                if (instr.alu.lop32i.invert_a)
+                    op_a = "~(" + op_a + ')';
 
-            std::string op_b = instr.iscadd.negate_b ? "-" : "";
+                if (instr.alu.lop32i.invert_b)
+                    op_b = "~(" + op_b + ')';
 
+                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticIntegerImmediate instruction: {}",
+                               opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+            break;
+        }
+        case OpCode::Type::ArithmeticInteger: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+            std::string op_b;
             if (instr.is_b_imm) {
                 op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
             } else {
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsInteger(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
                 }
             }
 
-            std::string shift = std::to_string(instr.iscadd.shift_amount.Value());
+            switch (opcode->GetId()) {
+            case OpCode::Id::IADD_C:
+            case OpCode::Id::IADD_R:
+            case OpCode::Id::IADD_IMM: {
+                if (instr.alu_integer.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+                                          instr.alu.saturate_d);
+                break;
+            }
+            case OpCode::Id::ISCADD_C:
+            case OpCode::Id::ISCADD_R:
+            case OpCode::Id::ISCADD_IMM: {
+                if (instr.alu_integer.negate_a)
+                    op_a = "-(" + op_a + ')';
+
+                if (instr.alu_integer.negate_b)
+                    op_b = "-(" + op_b + ')';
+
+                std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
+
+                regs.SetRegisterToInteger(instr.gpr0, true, 0,
+                                          "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
+                break;
+            }
+            case OpCode::Id::LOP_C:
+            case OpCode::Id::LOP_R:
+            case OpCode::Id::LOP_IMM: {
+                ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
+                ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
+
+                if (instr.alu.lop.invert_a)
+                    op_a = "~(" + op_a + ')';
+
+                if (instr.alu.lop.invert_b)
+                    op_b = "~(" + op_b + ')';
+
+                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
+                               opcode->GetName());
+                UNREACHABLE();
+            }
+            }
 
-            regs.SetRegisterToInteger(instr.gpr0, true, 0,
-                                      "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
             break;
         }
         case OpCode::Type::Ffma: {
@@ -953,7 +1125,8 @@ private:
 
             switch (opcode->GetId()) {
             case OpCode::Id::FFMA_CR: {
-                op_b += regs.GetUniform(instr.uniform, instr.gpr0);
+                op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        GLSLRegister::Type::Float);
                 op_c += regs.GetRegisterAsFloat(instr.gpr39);
                 break;
             }
@@ -964,7 +1137,8 @@ private:
             }
             case OpCode::Id::FFMA_RC: {
                 op_b += regs.GetRegisterAsFloat(instr.gpr39);
-                op_c += regs.GetUniform(instr.uniform, instr.gpr0);
+                op_c += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        GLSLRegister::Type::Float);
                 break;
             }
             case OpCode::Id::FFMA_IMM: {
@@ -978,31 +1152,33 @@ private:
             }
             }
 
-            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1);
+            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1,
+                                    instr.alu.saturate_d);
             break;
         }
         case OpCode::Type::Conversion: {
-            ASSERT_MSG(instr.conversion.size == Register::Size::Word, "Unimplemented");
             ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-            ASSERT_MSG(!instr.conversion.saturate_a, "Unimplemented");
 
             switch (opcode->GetId()) {
             case OpCode::Id::I2I_R: {
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
 
-                std::string op_a =
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_signed);
+                std::string op_a = regs.GetRegisterAsInteger(
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
-                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_signed, 0, op_a, 1, 1);
+                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
+                                          1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
                 break;
             }
             case OpCode::Id::I2F_R: {
-                std::string op_a =
-                    regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_signed);
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+                ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
+                std::string op_a = regs.GetRegisterAsInteger(
+                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
@@ -1012,13 +1188,71 @@ private:
                 break;
             }
             case OpCode::Id::F2F_R: {
+                ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
                 std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
 
+                switch (instr.conversion.f2f.rounding) {
+                case Tegra::Shader::F2fRoundingOp::None:
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Floor:
+                    op_a = "floor(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Ceil:
+                    op_a = "ceil(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2fRoundingOp::Trunc:
+                    op_a = "trunc(" + op_a + ')';
+                    break;
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unimplemented f2f rounding mode {}",
+                                   static_cast<u32>(instr.conversion.f2f.rounding.Value()));
+                    UNREACHABLE();
+                    break;
+                }
+
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
+                break;
+            }
+            case OpCode::Id::F2I_R: {
+                ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
+                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
+
+                if (instr.conversion.abs_a) {
+                    op_a = "abs(" + op_a + ')';
+                }
+
+                switch (instr.conversion.f2i.rounding) {
+                case Tegra::Shader::F2iRoundingOp::None:
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Floor:
+                    op_a = "floor(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Ceil:
+                    op_a = "ceil(" + op_a + ')';
+                    break;
+                case Tegra::Shader::F2iRoundingOp::Trunc:
+                    op_a = "trunc(" + op_a + ')';
+                    break;
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unimplemented f2i rounding mode {}",
+                                   static_cast<u32>(instr.conversion.f2i.rounding.Value()));
+                    UNREACHABLE();
+                    break;
+                }
+
+                if (instr.conversion.is_output_signed) {
+                    op_a = "int(" + op_a + ')';
+                } else {
+                    op_a = "uint(" + op_a + ')';
+                }
+
+                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
+                                          1, false, 0, instr.conversion.dest_size);
                 break;
             }
             default: {
@@ -1029,36 +1263,60 @@ private:
             break;
         }
         case OpCode::Type::Memory: {
-            const Attribute::Index attribute = instr.attribute.fmt20.index;
-
             switch (opcode->GetId()) {
             case OpCode::Id::LD_A: {
                 ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
                 regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element,
-                                                attribute);
+                                                instr.attribute.fmt20.index);
+                break;
+            }
+            case OpCode::Id::LD_C: {
+                ASSERT_MSG(instr.ld_c.unknown == 0, "Unimplemented");
+
+                std::string op_a =
+                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, instr.gpr8,
+                                            GLSLRegister::Type::Float);
+                std::string op_b =
+                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4, instr.gpr8,
+                                            GLSLRegister::Type::Float);
+
+                switch (instr.ld_c.type.Value()) {
+                case Tegra::Shader::UniformType::Single:
+                    regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                    break;
+
+                case Tegra::Shader::UniformType::Double:
+                    regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+                    regs.SetRegisterToFloat(instr.gpr0.Value() + 1, 0, op_b, 1, 1);
+                    break;
+
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unhandled type: {}",
+                                   static_cast<unsigned>(instr.ld_c.type.Value()));
+                    UNREACHABLE();
+                }
                 break;
             }
             case OpCode::Id::ST_A: {
                 ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetOutputAttributeToRegister(attribute, instr.attribute.fmt20.element,
-                                                  instr.gpr0);
+                regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index,
+                                                  instr.attribute.fmt20.element, instr.gpr0);
                 break;
             }
             case OpCode::Id::TEX: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 4, "untested");
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                 const std::string sampler = GetSampler(instr.sampler);
                 const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
-                // Add an extra scope and declare the texture coords inside to prevent overwriting
-                // them in case they are used as outputs of the texs instruction.
+                // Add an extra scope and declare the texture coords inside to prevent
+                // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
                 ++shader.scope;
                 shader.AddLine(coord);
                 const std::string texture = "texture(" + sampler + ", coords)";
 
                 size_t dest_elem{};
-                for (size_t elem = 0; elem < instr.attribute.fmt20.size; ++elem) {
+                for (size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -1071,7 +1329,6 @@ private:
                 break;
             }
             case OpCode::Id::TEXS: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 4, "untested");
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 const std::string sampler = GetSampler(instr.sampler);
@@ -1083,8 +1340,8 @@ private:
                 shader.AddLine(coord);
                 const std::string texture = "texture(" + sampler + ", coords)";
 
-                // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA goes
-                // into gpr28+0 and gpr28+1
+                // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA
+                // goes into gpr28+0 and gpr28+1
                 size_t offset{};
 
                 for (const auto& dest : {instr.gpr0.Value(), instr.gpr28.Value()}) {
@@ -1134,7 +1391,8 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Float);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Float);
                 }
             }
 
@@ -1167,15 +1425,17 @@ private:
         }
         case OpCode::Type::IntegerSetPredicate: {
             std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed);
+            std::string op_b;
 
-            std::string op_b{};
-
-            ASSERT_MSG(!instr.is_b_imm, "ISETP_IMM not implemented");
-
-            if (instr.is_b_gpr) {
-                op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.isetp.is_signed);
+            if (instr.is_b_imm) {
+                op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
             } else {
-                op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Integer);
+                if (instr.is_b_gpr) {
+                    op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.isetp.is_signed);
+                } else {
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Integer);
+                }
             }
 
             using Tegra::Shader::Pred;
@@ -1221,7 +1481,8 @@ private:
                 if (instr.is_b_gpr) {
                     op_b += regs.GetRegisterAsFloat(instr.gpr20);
                 } else {
-                    op_b += regs.GetUniform(instr.uniform, GLSLRegister::Type::Float);
+                    op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                            GLSLRegister::Type::Float);
                 }
             }
 
@@ -1229,8 +1490,8 @@ private:
                 op_b = "abs(" + op_b + ')';
             }
 
-            // The fset instruction sets a register to 1.0 if the condition is true, and to 0
-            // otherwise.
+            // The fset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the
+            // condition is true, and to 0 otherwise.
             std::string second_pred =
                 GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0);
 
@@ -1248,6 +1509,41 @@ private:
             }
             break;
         }
+        case OpCode::Type::IntegerSet: {
+            std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed);
+
+            std::string op_b;
+
+            if (instr.is_b_imm) {
+                op_b = std::to_string(instr.alu.GetSignedImm20_20());
+            } else {
+                if (instr.is_b_gpr) {
+                    op_b = regs.GetRegisterAsInteger(instr.gpr20, 0, instr.iset.is_signed);
+                } else {
+                    op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Integer);
+                }
+            }
+
+            // The iset instruction sets a register to 1.0 (bf set) or 0xFFFFFFFF (bf clear) if the
+            // condition is true, and to 0 otherwise.
+            std::string second_pred =
+                GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0);
+
+            std::string comparator = GetPredicateComparison(instr.iset.cond);
+            std::string combiner = GetPredicateCombiner(instr.iset.op);
+
+            std::string predicate = "(((" + op_a + ") " + comparator + " (" + op_b + ")) " +
+                                    combiner + " (" + second_pred + "))";
+
+            if (instr.iset.bf) {
+                regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
+            } else {
+                regs.SetRegisterToInteger(instr.gpr0, false, 0, predicate + " ? 0xFFFFFFFF : 0", 1,
+                                          1);
+            }
+            break;
+        }
         default: {
             switch (opcode->GetId()) {
             case OpCode::Id::EXIT: {
@@ -1261,8 +1557,8 @@ private:
 
                 shader.AddLine("return true;");
                 if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
-                    // If this is an unconditional exit then just end processing here, otherwise we
-                    // have to account for the possibility of the condition not being met, so
+                    // If this is an unconditional exit then just end processing here, otherwise
+                    // we have to account for the possibility of the condition not being met, so
                     // continue processing the next instruction.
                     offset = PROGRAM_END - 1;
                 }
@@ -1284,6 +1580,11 @@ private:
                 regs.SetRegisterToInputAttibute(instr.gpr0, attribute.element, attribute.index);
                 break;
             }
+            case OpCode::Id::SSY: {
+                // The SSY opcode tells the GPU where to re-converge divergent execution paths; we
+                // can ignore this when generating GLSL code.
+                break;
+            }
             default: {
                 NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
                 UNREACHABLE();
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 254f6e2c3..c1e6fac9f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -39,6 +39,10 @@ void main() {
     // Viewport can be flipped, which is unsupported by glViewport
     position.xy *= viewport_flip.xy;
     gl_Position = position;
+
+    // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
+    // For now, this is here to bring order in lieu of proper emulation
+    position.w = 1.0;
 }
 )";
     out += program.first;
@@ -62,8 +66,6 @@ layout (std140) uniform fs_config {
     vec4 viewport_flip;
 };
 
-uniform sampler2D tex[32];
-
 void main() {
     exec_shader();
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 458032b5c..ed890e0f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -22,17 +22,28 @@ class ConstBufferEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    void MarkAsUsed(unsigned index, unsigned offset, Maxwell::ShaderStage stage) {
+    void MarkAsUsed(u64 index, u64 offset, Maxwell::ShaderStage stage) {
         is_used = true;
-        this->index = index;
+        this->index = static_cast<unsigned>(index);
+        this->stage = stage;
+        max_offset = std::max(max_offset, static_cast<unsigned>(offset));
+    }
+
+    void MarkAsUsedIndirect(u64 index, Maxwell::ShaderStage stage) {
+        is_used = true;
+        is_indirect = true;
+        this->index = static_cast<unsigned>(index);
         this->stage = stage;
-        max_offset = std::max(max_offset, offset);
     }
 
     bool IsUsed() const {
         return is_used;
     }
 
+    bool IsIndirect() const {
+        return is_indirect;
+    }
+
     unsigned GetIndex() const {
         return index;
     }
@@ -51,13 +62,54 @@ private:
     };
 
     bool is_used{};
+    bool is_indirect{};
     unsigned index{};
     unsigned max_offset{};
     Maxwell::ShaderStage stage;
 };
 
+class SamplerEntry {
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+public:
+    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index)
+        : offset(offset), stage(stage), sampler_index(index) {}
+
+    size_t GetOffset() const {
+        return offset;
+    }
+
+    size_t GetIndex() const {
+        return sampler_index;
+    }
+
+    Maxwell::ShaderStage GetStage() const {
+        return stage;
+    }
+
+    std::string GetName() const {
+        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' +
+               std::to_string(sampler_index) + ']';
+    }
+
+    static std::string GetArrayName(Maxwell::ShaderStage stage) {
+        return TextureSamplerNames[static_cast<size_t>(stage)];
+    }
+
+private:
+    static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = {
+        "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs",
+    };
+    /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
+    /// instruction.
+    size_t offset;
+    Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used.
+    size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
+};
+
 struct ShaderEntries {
     std::vector<ConstBufferEntry> const_buffer_entries;
+    std::vector<SamplerEntry> texture_samplers;
 };
 
 using ProgramResult = std::pair<std::string, ShaderEntries>;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index ccdfc2718..d7167b298 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -32,33 +32,14 @@ void SetShaderUniformBlockBindings(GLuint shader) {
                                  sizeof(MaxwellUniformData));
 }
 
-void SetShaderSamplerBindings(GLuint shader) {
-    OpenGLState cur_state = OpenGLState::GetCurState();
-    GLuint old_program = std::exchange(cur_state.draw.shader_program, shader);
-    cur_state.Apply();
-
-    // Set the texture samplers to correspond to different texture units
-    for (u32 texture = 0; texture < NumTextureSamplers; ++texture) {
-        // Set the texture samplers to correspond to different texture units
-        std::string uniform_name = "tex[" + std::to_string(texture) + "]";
-        GLint uniform_tex = glGetUniformLocation(shader, uniform_name.c_str());
-        if (uniform_tex != -1) {
-            glUniform1i(uniform_tex, TextureUnits::MaxwellTexture(texture).id);
-        }
-    }
-
-    cur_state.draw.shader_program = old_program;
-    cur_state.Apply();
-}
-
 } // namespace Impl
 
 void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) {
     const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
 
     // TODO(bunnei): Support more than one viewport
-    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0 : 1.0;
-    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0 : 1.0;
+    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
+    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
 }
 
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index e963b4b7e..4295c20a6 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -45,7 +45,6 @@ public:
         shader.Create(program_result.first.c_str(), type);
         program.Create(true, shader.handle);
         Impl::SetShaderUniformBlockBindings(program.handle);
-        Impl::SetShaderSamplerBindings(program.handle);
         entries = program_result.second;
     }
     GLuint GetHandle() const {
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index f91dfe36a..1f1e48425 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -50,6 +50,10 @@ OpenGLState::OpenGLState() {
     for (auto& texture_unit : texture_units) {
         texture_unit.texture_2d = 0;
         texture_unit.sampler = 0;
+        texture_unit.swizzle.r = GL_RED;
+        texture_unit.swizzle.g = GL_GREEN;
+        texture_unit.swizzle.b = GL_BLUE;
+        texture_unit.swizzle.a = GL_ALPHA;
     }
 
     lighting_lut.texture_buffer = 0;
@@ -192,13 +196,26 @@ void OpenGLState::Apply() const {
     }
 
     // Textures
-    for (size_t i = 0; i < std::size(texture_units); ++i) {
+    for (int i = 0; i < static_cast<int>(std::size(texture_units)); ++i) {
         if (texture_units[i].texture_2d != cur_state.texture_units[i].texture_2d) {
             glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
             glBindTexture(GL_TEXTURE_2D, texture_units[i].texture_2d);
         }
         if (texture_units[i].sampler != cur_state.texture_units[i].sampler) {
-            glBindSampler(i, texture_units[i].sampler);
+            glBindSampler(static_cast<GLuint>(i), texture_units[i].sampler);
+        }
+        // Update the texture swizzle. NOTE: swizzle is stored on the texture object, so binding a
+        // different texture while the cached swizzle is unchanged does not re-apply it here.
+        if (texture_units[i].swizzle.r != cur_state.texture_units[i].swizzle.r ||
+            texture_units[i].swizzle.g != cur_state.texture_units[i].swizzle.g ||
+            texture_units[i].swizzle.b != cur_state.texture_units[i].swizzle.b ||
+            texture_units[i].swizzle.a != cur_state.texture_units[i].swizzle.a) {
+            std::array<GLint, 4> mask = {texture_units[i].swizzle.r, texture_units[i].swizzle.g,
+                                         texture_units[i].swizzle.b, texture_units[i].swizzle.a};
+            // glTexParameteriv targets the texture bound on the *active* unit, which is only
+            // switched above when the binding changed, so select this unit explicitly.
+            glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
+            glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, mask.data());
         }
     }
 
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 75c08e645..839e50e93 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -85,6 +85,12 @@ public:
     struct {
         GLuint texture_2d; // GL_TEXTURE_BINDING_2D
         GLuint sampler;    // GL_SAMPLER_BINDING
+        struct {
+            GLint r; // GL_TEXTURE_SWIZZLE_R
+            GLint g; // GL_TEXTURE_SWIZZLE_G
+            GLint b; // GL_TEXTURE_SWIZZLE_B
+            GLint a; // GL_TEXTURE_SWIZZLE_A
+        } swizzle;
     } texture_units[32];
 
     struct {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index a630610d8..2155fb019 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -100,6 +100,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
     switch (wrap_mode) {
     case Tegra::Texture::WrapMode::Wrap:
         return GL_REPEAT;
+    case Tegra::Texture::WrapMode::Mirror:
+        return GL_MIRRORED_REPEAT;
     case Tegra::Texture::WrapMode::ClampToEdge:
         return GL_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::ClampOGL:
@@ -178,4 +180,25 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     return {};
 }
 
+inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
+    switch (source) {
+    case Tegra::Texture::SwizzleSource::Zero:
+        return GL_ZERO;
+    case Tegra::Texture::SwizzleSource::R:
+        return GL_RED;
+    case Tegra::Texture::SwizzleSource::G:
+        return GL_GREEN;
+    case Tegra::Texture::SwizzleSource::B:
+        return GL_BLUE;
+    case Tegra::Texture::SwizzleSource::A:
+        return GL_ALPHA;
+    case Tegra::Texture::SwizzleSource::OneInt:
+    case Tegra::Texture::SwizzleSource::OneFloat:
+        return GL_ONE;
+    }
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNREACHABLE();
+    return {};
+}
+
 } // namespace MaxwellToGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3440d2190..f33766bfd 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,6 +316,7 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
     }};
 
     state.texture_units[0].texture_2d = screen_info.display_texture;
+    state.texture_units[0].swizzle = {GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA};
     state.Apply();
 
     glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices.data());