6 files changed, 386 insertions, 126 deletions
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 93eadde7a..fe1f55e85 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -98,7 +98,8 @@ RasterizerOpenGL::~RasterizerOpenGL() {}
 std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
                                                              GLintptr buffer_offset) {
     MICROPROFILE_SCOPE(OpenGL_VAO);
-    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
 
     state.draw.vertex_array = hw_vao.handle;
     state.draw.vertex_buffer = stream_buffer.GetHandle();
@@ -110,9 +111,13 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         if (!vertex_array.IsEnabled())
             continue;
 
-        const Tegra::GPUVAddr start = vertex_array.StartAddress();
+        Tegra::GPUVAddr start = vertex_array.StartAddress();
         const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
 
+        if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
+            start += vertex_array.stride * (gpu.state.current_instance / vertex_array.divisor);
+        }
+
         ASSERT(end > start);
         u64 size = end - start + 1;
 
@@ -124,7 +129,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
                            vertex_array.stride);
 
-        ASSERT_MSG(vertex_array.divisor == 0, "Instanced vertex arrays are not supported");
+        if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
+            // Tell OpenGL that this is an instanced vertex buffer to prevent accessing different
+            // indexes on each vertex. We do the instance indexing manually by incrementing the
+            // start address of the vertex buffer.
+            glVertexBindingDivisor(index, 1);
+        } else {
+            // Disable the vertex buffer instancing.
+            glVertexBindingDivisor(index, 0);
+        }
     }
 
     // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 38aa067b6..fb7476fb8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -94,6 +94,7 @@ struct FormatTuple {
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
+    {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},   // ABGR8UI
     {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10U
@@ -245,6 +246,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<true, PixelFormat::ABGR8U>,
         MortonCopy<true, PixelFormat::ABGR8S>,
+        MortonCopy<true, PixelFormat::ABGR8UI>,
         MortonCopy<true, PixelFormat::B5G6R5U>,
         MortonCopy<true, PixelFormat::A2B10G10R10U>,
         MortonCopy<true, PixelFormat::A1B5G5R5U>,
@@ -299,6 +301,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<false, PixelFormat::ABGR8U>,
         MortonCopy<false, PixelFormat::ABGR8S>,
+        MortonCopy<false, PixelFormat::ABGR8UI>,
         MortonCopy<false, PixelFormat::B5G6R5U>,
         MortonCopy<false, PixelFormat::A2B10G10R10U>,
         MortonCopy<false, PixelFormat::A1B5G5R5U>,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index beec01746..fc8b44219 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -25,59 +25,60 @@ struct SurfaceParams {
     enum class PixelFormat {
         ABGR8U = 0,
         ABGR8S = 1,
-        B5G6R5U = 2,
-        A2B10G10R10U = 3,
-        A1B5G5R5U = 4,
-        R8U = 5,
-        R8UI = 6,
-        RGBA16F = 7,
-        RGBA16U = 8,
-        RGBA16UI = 9,
-        R11FG11FB10F = 10,
-        RGBA32UI = 11,
-        DXT1 = 12,
-        DXT23 = 13,
-        DXT45 = 14,
-        DXN1 = 15, // This is also known as BC4
-        DXN2UNORM = 16,
-        DXN2SNORM = 17,
-        BC7U = 18,
-        ASTC_2D_4X4 = 19,
-        G8R8U = 20,
-        G8R8S = 21,
-        BGRA8 = 22,
-        RGBA32F = 23,
-        RG32F = 24,
-        R32F = 25,
-        R16F = 26,
-        R16U = 27,
-        R16S = 28,
-        R16UI = 29,
-        R16I = 30,
-        RG16 = 31,
-        RG16F = 32,
-        RG16UI = 33,
-        RG16I = 34,
-        RG16S = 35,
-        RGB32F = 36,
-        SRGBA8 = 37,
-        RG8U = 38,
-        RG8S = 39,
-        RG32UI = 40,
-        R32UI = 41,
+        ABGR8UI = 2,
+        B5G6R5U = 3,
+        A2B10G10R10U = 4,
+        A1B5G5R5U = 5,
+        R8U = 6,
+        R8UI = 7,
+        RGBA16F = 8,
+        RGBA16U = 9,
+        RGBA16UI = 10,
+        R11FG11FB10F = 11,
+        RGBA32UI = 12,
+        DXT1 = 13,
+        DXT23 = 14,
+        DXT45 = 15,
+        DXN1 = 16, // This is also known as BC4
+        DXN2UNORM = 17,
+        DXN2SNORM = 18,
+        BC7U = 19,
+        ASTC_2D_4X4 = 20,
+        G8R8U = 21,
+        G8R8S = 22,
+        BGRA8 = 23,
+        RGBA32F = 24,
+        RG32F = 25,
+        R32F = 26,
+        R16F = 27,
+        R16U = 28,
+        R16S = 29,
+        R16UI = 30,
+        R16I = 31,
+        RG16 = 32,
+        RG16F = 33,
+        RG16UI = 34,
+        RG16I = 35,
+        RG16S = 36,
+        RGB32F = 37,
+        SRGBA8 = 38,
+        RG8U = 39,
+        RG8S = 40,
+        RG32UI = 41,
+        R32UI = 42,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 42,
-        Z16 = 43,
+        Z32F = 43,
+        Z16 = 44,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 44,
-        S8Z24 = 45,
-        Z32FS8 = 46,
+        Z24S8 = 45,
+        S8Z24 = 46,
+        Z32FS8 = 47,
 
         MaxDepthStencilFormat,
 
@@ -117,6 +118,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{
             1, // ABGR8U
             1, // ABGR8S
+            1, // ABGR8UI
             1, // B5G6R5U
             1, // A2B10G10R10U
             1, // A1B5G5R5U
@@ -175,6 +177,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
             32,  // ABGR8U
             32,  // ABGR8S
+            32,  // ABGR8UI
             16,  // B5G6R5U
             32,  // A2B10G10R10U
             16,  // A1B5G5R5U
@@ -257,6 +260,8 @@ struct SurfaceParams {
             return PixelFormat::ABGR8U;
         case Tegra::RenderTargetFormat::RGBA8_SNORM:
             return PixelFormat::ABGR8S;
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
+            return PixelFormat::ABGR8UI;
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
@@ -327,6 +332,8 @@ struct SurfaceParams {
                 return PixelFormat::ABGR8U;
             case Tegra::Texture::ComponentType::SNORM:
                 return PixelFormat::ABGR8S;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::ABGR8UI;
             }
             LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
                          static_cast<u32>(component_type));
@@ -551,6 +558,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R16_UINT:
         case Tegra::RenderTargetFormat::RG32_UINT:
         case Tegra::RenderTargetFormat::R32_UINT:
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
             return ComponentType::UInt;
         case Tegra::RenderTargetFormat::RG16_SINT:
         case Tegra::RenderTargetFormat::R16_SINT:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 6cc223328..214a5fa9a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -440,13 +440,12 @@ public:
         }
         declarations.AddNewLine();
 
-        // Append the sampler2D array for the used textures.
-        size_t num_samplers = GetSamplers().size();
-        if (num_samplers > 0) {
-            declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' +
-                                 std::to_string(num_samplers) + "];");
-            declarations.AddNewLine();
+        const auto& samplers = GetSamplers();
+        for (const auto& sampler : samplers) {
+            declarations.AddLine("uniform " + sampler.GetTypeString() + ' ' + sampler.GetName() +
+                                 ';');
         }
+        declarations.AddNewLine();
     }
 
     /// Returns a list of constant buffer declarations
@@ -458,13 +457,14 @@ public:
     }
 
     /// Returns a list of samplers used in the shader
-    std::vector<SamplerEntry> GetSamplers() const {
+    const std::vector<SamplerEntry>& GetSamplers() const {
         return used_samplers;
     }
 
     /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if
     /// necessary.
-    std::string AccessSampler(const Sampler& sampler) {
+    std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type,
+                              bool is_array) {
         size_t offset = static_cast<size_t>(sampler.index.Value());
 
         // If this sampler has already been used, return the existing mapping.
@@ -473,12 +473,13 @@ public:
                          [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; });
 
         if (itr != used_samplers.end()) {
+            ASSERT(itr->GetType() == type && itr->IsArray() == is_array);
             return itr->GetName();
         }
 
         // Otherwise create a new mapping for this sampler
         size_t next_index = used_samplers.size();
-        SamplerEntry entry{stage, offset, next_index};
+        SamplerEntry entry{stage, offset, next_index, type, is_array};
         used_samplers.emplace_back(entry);
         return entry.GetName();
     }
@@ -543,6 +544,10 @@ private:
             // shader.
             ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
             return "vec4(0, 0, uintBitsToFloat(instance_id.x), uintBitsToFloat(gl_VertexID))";
+        case Attribute::Index::FrontFacing:
+            // TODO(Subv): Find out what the values are for the other elements.
+            ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment);
+            return "vec4(0, 0, 0, uintBitsToFloat(gl_FrontFacing ? 1 : 0))";
         default:
             const u32 index{static_cast<u32>(attribute) -
                             static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -652,8 +657,8 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler) {
-        return regs.AccessSampler(sampler);
+    std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array) {
+        return regs.AccessSampler(sampler, type, is_array);
     }
 
     /**
@@ -761,6 +766,30 @@ private:
         return op->second;
     }
 
+    /**
+     * Transforms the input string GLSL operand into one that applies the abs() function and negates
+     * the output if necessary. When both abs and neg are true, the negation will be applied after
+     * taking the absolute value.
+     * @param operand The input operand to take the abs() of, negate, or both.
+     * @param abs Whether to apply the abs() function to the input operand.
+     * @param neg Whether to negate the input operand.
+     * @returns String corresponding to the operand after being transformed by the abs() and
+     * negation operations.
+     */
+    static std::string GetOperandAbsNeg(const std::string& operand, bool abs, bool neg) {
+        std::string result = operand;
+
+        if (abs) {
+            result = "abs(" + result + ')';
+        }
+
+        if (neg) {
+            result = "-(" + result + ')';
+        }
+
+        return result;
+    }
+
     /*
      * Returns whether the instruction at the specified offset is a 'sched' instruction.
      * Sched instructions always appear before a sequence of 3 instructions.
@@ -774,28 +803,51 @@ private:
     }
 
     void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
-                             const std::string& op_b) {
+                             const std::string& op_b,
+                             Tegra::Shader::PredicateResultMode predicate_mode,
+                             Tegra::Shader::Pred predicate) {
+        std::string result{};
         switch (logic_op) {
         case LogicOperation::And: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " & " + op_b + ')', 1, 1);
+            result = '(' + op_a + " & " + op_b + ')';
             break;
         }
         case LogicOperation::Or: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " | " + op_b + ')', 1, 1);
+            result = '(' + op_a + " | " + op_b + ')';
             break;
         }
         case LogicOperation::Xor: {
-            regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " ^ " + op_b + ')', 1, 1);
+            result = '(' + op_a + " ^ " + op_b + ')';
             break;
         }
         case LogicOperation::PassB: {
-            regs.SetRegisterToInteger(dest, true, 0, op_b, 1, 1);
+            result = op_b;
             break;
         }
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
             UNREACHABLE();
         }
+
+        if (dest != Tegra::Shader::Register::ZeroIndex) {
+            regs.SetRegisterToInteger(dest, true, 0, result, 1, 1);
+        }
+
+        using Tegra::Shader::PredicateResultMode;
+        // Write the predicate value depending on the predicate mode.
+        switch (predicate_mode) {
+        case PredicateResultMode::None:
+            // Do nothing.
+            return;
+        case PredicateResultMode::NotZero:
+            // Set the predicate to true if the result is not zero.
+            SetPredicate(static_cast<u64>(predicate), '(' + result + ") != 0");
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented predicate result mode: {}",
+                         static_cast<u32>(predicate_mode));
+            UNREACHABLE();
+        }
     }
 
     void WriteTexsInstruction(const Instruction& instr, const std::string& coord,
@@ -806,29 +858,56 @@ private:
         ++shader.scope;
         shader.AddLine(coord);
 
-        // TEXS has two destination registers. RG goes into gpr0+0 and gpr0+1, and BA
-        // goes into gpr28+0 and gpr28+1
-        size_t texs_offset{};
-
-        size_t src_elem{};
-        for (const auto& dest : {instr.gpr0.Value(), instr.gpr28.Value()}) {
-            size_t dest_elem{};
-            for (unsigned elem = 0; elem < 2; ++elem) {
-                if (!instr.texs.IsComponentEnabled(src_elem++)) {
-                    // Skip disabled components
-                    continue;
-                }
-                regs.SetRegisterToFloat(dest, elem + texs_offset, texture, 1, 4, false,
-                                        dest_elem++);
+        // TEXS has two destination registers and a swizzle. The first two elements in the swizzle
+        // go into gpr0+0 and gpr0+1, and the rest goes into gpr28+0 and gpr28+1
+
+        size_t written_components = 0;
+        for (u32 component = 0; component < 4; ++component) {
+            if (!instr.texs.IsComponentEnabled(component)) {
+                continue;
             }
 
-            if (!instr.texs.HasTwoDestinations()) {
-                // Skip the second destination
-                break;
+            if (written_components < 2) {
+                // Write the first two swizzle components to gpr0 and gpr0+1
+                regs.SetRegisterToFloat(instr.gpr0, component, texture, 1, 4, false,
+                                        written_components % 2);
+            } else {
+                ASSERT(instr.texs.HasTwoDestinations());
+                // Write the rest of the swizzle components to gpr28 and gpr28+1
+                regs.SetRegisterToFloat(instr.gpr28, component, texture, 1, 4, false,
+                                        written_components % 2);
             }
 
-            texs_offset += 2;
+            ++written_components;
         }
+
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
+    /*
+     * Emits code to push the input target address to the SSY address stack, incrementing the stack
+     * top.
+     */
+    void EmitPushToSSYStack(u32 target) {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack[ssy_stack_top] = " + std::to_string(target) + "u;");
+        shader.AddLine("ssy_stack_top++;");
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
+    /*
+     * Emits code to pop an address from the SSY address stack, setting the jump address to the
+     * popped address and decrementing the stack top.
+     */
+    void EmitPopFromSSYStack() {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack_top--;");
+        shader.AddLine("jmp_to = ssy_stack[ssy_stack_top];");
+        shader.AddLine("break;");
         --shader.scope;
         shader.AddLine('}');
     }
@@ -902,13 +981,6 @@ private:
         switch (opcode->GetType()) {
         case OpCode::Type::Arithmetic: {
             std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-            if (instr.alu.abs_a) {
-                op_a = "abs(" + op_a + ')';
-            }
-
-            if (instr.alu.negate_a) {
-                op_a = "-(" + op_a + ')';
-            }
 
             std::string op_b;
 
@@ -923,17 +995,10 @@ private:
                 }
             }
 
-            if (instr.alu.abs_b) {
-                op_b = "abs(" + op_b + ')';
-            }
-
-            if (instr.alu.negate_b) {
-                op_b = "-(" + op_b + ')';
-            }
-
             switch (opcode->GetId()) {
             case OpCode::Id::MOV_C:
             case OpCode::Id::MOV_R: {
+                // MOV has neither 'abs' nor 'neg' bits.
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 break;
             }
@@ -941,6 +1006,8 @@ private:
             case OpCode::Id::FMUL_C:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
+                // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
+                op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
@@ -948,11 +1015,14 @@ private:
             case OpCode::Id::FADD_C:
             case OpCode::Id::FADD_R:
             case OpCode::Id::FADD_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::MUFU: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
                 switch (instr.sub_op) {
                 case SubOp::Cos:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
@@ -992,6 +1062,9 @@ private:
             case OpCode::Id::FMNMX_C:
             case OpCode::Id::FMNMX_R:
             case OpCode::Id::FMNMX_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
+
                 std::string condition =
                     GetPredicateCondition(instr.alu.fmnmx.pred, instr.alu.fmnmx.negate_pred != 0);
                 std::string parameters = op_a + ',' + op_b;
@@ -1005,7 +1078,7 @@ private:
             case OpCode::Id::RRO_R:
             case OpCode::Id::RRO_IMM: {
                 // Currently RRO is only implemented as a register move.
-                // Usage of `abs_b` and `negate_b` here should also be correct.
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 LOG_WARNING(HW_GPU, "RRO instruction is incomplete");
                 break;
@@ -1142,7 +1215,9 @@ private:
                 if (instr.alu.lop32i.invert_b)
                     op_b = "~(" + op_b + ')';
 
-                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+                WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b,
+                                    Tegra::Shader::PredicateResultMode::None,
+                                    Tegra::Shader::Pred::UnusedIndex);
                 break;
             }
             default: {
@@ -1208,16 +1283,14 @@ private:
             case OpCode::Id::LOP_C:
             case OpCode::Id::LOP_R:
             case OpCode::Id::LOP_IMM: {
-                ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
-                ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
-
                 if (instr.alu.lop.invert_a)
                     op_a = "~(" + op_a + ')';
 
                 if (instr.alu.lop.invert_b)
                     op_b = "~(" + op_b + ')';
 
-                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+                WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b,
+                                    instr.alu.lop.pred_result_mode, instr.alu.lop.pred48);
                 break;
             }
             case OpCode::Id::IMNMX_C:
@@ -1282,8 +1355,6 @@ private:
             break;
         }
         case OpCode::Type::Conversion: {
-            ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-
             switch (opcode->GetId()) {
             case OpCode::Id::I2I_R: {
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
@@ -1480,10 +1551,31 @@ private:
                 break;
             }
             case OpCode::Id::TEX: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                ASSERT_MSG(instr.tex.array == 0, "TEX arrays unimplemented");
+                std::string coord{};
+
+                switch (instr.tex.texture_type) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                case Tegra::Shader::TextureType::Texture3D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(instr.tex.texture_type.Value()));
+                    UNREACHABLE();
+                }
+
+                const std::string sampler =
+                    GetSampler(instr.sampler, instr.tex.texture_type, instr.tex.array);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
@@ -1505,24 +1597,122 @@ private:
                 break;
             }
             case OpCode::Id::TEXS: {
-                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                std::string coord{};
+
+                switch (instr.texs.GetTextureType()) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (instr.texs.IsArrayTexture()) {
+                        std::string index = regs.GetRegisterAsInteger(instr.gpr8);
+                        std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                        std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");";
+                    } else {
+                        std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                        std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                case Tegra::Shader::TextureType::TextureCube: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(instr.texs.GetTextureType()));
+                    UNREACHABLE();
+                }
+                const std::string sampler = GetSampler(instr.sampler, instr.texs.GetTextureType(),
+                                                       instr.texs.IsArrayTexture());
 
                 const std::string texture = "texture(" + sampler + ", coords)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
             case OpCode::Id::TLDS: {
-                const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
-                const std::string op_b = regs.GetRegisterAsInteger(instr.gpr20);
-                const std::string sampler = GetSampler(instr.sampler);
-                const std::string coord = "ivec2 coords = ivec2(" + op_a + ", " + op_b + ");";
+                ASSERT(instr.tlds.GetTextureType() == Tegra::Shader::TextureType::Texture2D);
+                ASSERT(instr.tlds.IsArrayTexture() == false);
+                std::string coord{};
+
+                switch (instr.tlds.GetTextureType()) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    if (instr.tlds.IsArrayTexture()) {
+                        LOG_CRITICAL(HW_GPU, "Unhandled 2d array texture");
+                        UNREACHABLE();
+                    } else {
+                        std::string x = regs.GetRegisterAsInteger(instr.gpr8);
+                        std::string y = regs.GetRegisterAsInteger(instr.gpr20);
+                        coord = "ivec2 coords = ivec2(" + x + ", " + y + ");";
+                    }
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(instr.tlds.GetTextureType()));
+                    UNREACHABLE();
+                }
+                const std::string sampler = GetSampler(instr.sampler, instr.tlds.GetTextureType(),
+                                                       instr.tlds.IsArrayTexture());
                 const std::string texture = "texelFetch(" + sampler + ", coords, 0)";
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
+            case OpCode::Id::TLD4: {
+                ASSERT(instr.tld4.texture_type == Tegra::Shader::TextureType::Texture2D);
+                ASSERT(instr.tld4.array == 0);
+                std::string coord{};
+
+                switch (instr.tld4.texture_type) {
+                case Tegra::Shader::TextureType::Texture2D: {
+                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    break;
+                }
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
+                                 static_cast<u32>(instr.tld4.texture_type.Value()));
+                    UNREACHABLE();
+                }
+
+                const std::string sampler =
+                    GetSampler(instr.sampler, instr.tld4.texture_type, instr.tld4.array);
+                // Add an extra scope and declare the texture coords inside to prevent
+                // overwriting them in case they are used as outputs of the tld4 instruction.
+                shader.AddLine("{");
+                ++shader.scope;
+                shader.AddLine(coord);
+                const std::string texture = "textureGather(" + sampler + ", coords, " +
+                                            std::to_string(instr.tld4.component) + ')';
+
+                size_t dest_elem{};
+                for (size_t elem = 0; elem < 4; ++elem) {
+                    if (!instr.tex.IsComponentEnabled(elem)) {
+                        // Skip disabled components
+                        continue;
+                    }
+                    regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
+                    ++dest_elem;
+                }
+                --shader.scope;
+                shader.AddLine("}");
+                break;
+            }
+            case OpCode::Id::TLD4S: {
+                const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
+                const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
+                // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction.
+                const std::string sampler =
+                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
+                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                const std::string texture = "textureGather(" + sampler + ", coords, " +
+                                            std::to_string(instr.tld4s.component) + ')';
+                WriteTexsInstruction(instr, coord, texture);
+                break;
+            }
             default: {
                 LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
                 UNREACHABLE();
@@ -1882,13 +2072,13 @@ private:
                 ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported");
 
                 u32 target = offset + instr.bra.GetBranchTarget();
-                shader.AddLine("ssy_target = " + std::to_string(target) + "u;");
+                EmitPushToSSYStack(target);
                 break;
             }
             case OpCode::Id::SYNC: {
                 // The SYNC opcode jumps to the address previously set by the SSY opcode
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
-                shader.AddLine("{ jmp_to = ssy_target; break; }");
+                EmitPopFromSSYStack();
                 break;
             }
             case OpCode::Id::DEPBAR: {
@@ -1959,7 +2149,13 @@ private:
             } else {
                 labels.insert(subroutine.begin);
                 shader.AddLine("uint jmp_to = " + std::to_string(subroutine.begin) + "u;");
-                shader.AddLine("uint ssy_target = 0u;");
+
+                // TODO(Subv): Figure out the actual depth of the SSY stack, for now it seems
+                // unlikely that shaders will use 20 nested SSYs.
+                constexpr u32 SSY_STACK_SIZE = 20;
+                shader.AddLine("uint ssy_stack[" + std::to_string(SSY_STACK_SIZE) + "];");
+                shader.AddLine("uint ssy_stack_top = 0u;");
+
                 shader.AddLine("while (true) {");
                 ++shader.scope;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 4729ce0fc..db48da645 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -11,6 +11,7 @@
 #include <vector>
 #include "common/common_types.h"
 #include "common/hash.h"
+#include "video_core/engines/shader_bytecode.h"
 
 namespace GLShader {
 
@@ -72,8 +73,9 @@ class SamplerEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index)
-        : offset(offset), stage(stage), sampler_index(index) {}
+    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index,
+                 Tegra::Shader::TextureType type, bool is_array)
+        : offset{offset}, stage{stage}, sampler_index{index}, type{type}, is_array{is_array} {}
 
     size_t GetOffset() const {
         return offset;
@@ -88,8 +90,41 @@ public:
     }
 
     std::string GetName() const {
-        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' +
-               std::to_string(sampler_index) + ']';
+        const std::string stage_prefix = TextureSamplerNames[static_cast<size_t>(stage)];
+        return stage_prefix + '_' + std::to_string(sampler_index);
+    }
+
+    /// Builds the GLSL sampler type keyword for this entry, e.g. "sampler2D" or "sampler2DArray".
+    std::string GetTypeString() const {
+        using Tegra::Shader::TextureType;
+        // Base keyword for the sampled texture type; stays empty if the type is not handled.
+        const char* base_type = "";
+        switch (type) {
+        case TextureType::Texture1D:
+            base_type = "sampler1D";
+            break;
+        case TextureType::Texture2D:
+            base_type = "sampler2D";
+            break;
+        case TextureType::Texture3D:
+            base_type = "sampler3D";
+            break;
+        case TextureType::TextureCube:
+            base_type = "samplerCube";
+            break;
+        default:
+            UNIMPLEMENTED();
+        }
+        // Entries sampled as array textures get the "Array" suffix appended to the keyword.
+        return std::string(base_type) + (is_array ? "Array" : "");
+    }
+
+    Tegra::Shader::TextureType GetType() const {
+        return type; // Texture type stored in this entry (the `type` member).
+    }
+
+    bool IsArray() const {
+        return is_array; // True when this entry is sampled as an array texture.
     }
 
     static std::string GetArrayName(Maxwell::ShaderStage stage) {
@@ -100,11 +135,14 @@ private:
     static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = {
         "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs",
     };
+
     /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
     /// instruction.
     size_t offset;
-    Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used.
-    size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
+    Maxwell::ShaderStage stage;      ///< Shader stage where this sampler was used.
+    size_t sampler_index;            ///< Index appended to the stage prefix to name the sampler.
+    Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc)
+    bool is_array; ///< Whether the texture is being sampled as an array texture or not.
 };
 
 struct ShaderEntries {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 8f719fdd8..5d91a0c2f 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -147,6 +147,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
         // manually mix them. However the shader part of this is not yet implemented.
         return GL_CLAMP_TO_BORDER;
+    case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
+        return GL_MIRROR_CLAMP_TO_EDGE;
     }
     LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
     UNREACHABLE();