7 files changed, 145 insertions, 72 deletions
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 569225ad7..3ba6fe614 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -12,6 +12,7 @@
 
 #include <boost/optional.hpp>
 
+#include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_types.h"
 
@@ -299,6 +300,10 @@ union Instruction {
     } alu;
 
     union {
+        BitField<48, 1, u64> negate_b;
+    } fmul;
+
+    union {
         BitField<48, 1, u64> is_signed;
     } shift;
 
@@ -485,16 +490,20 @@ union Instruction {
         }
 
         bool IsComponentEnabled(size_t component) const {
-            static constexpr std::array<std::array<u32, 8>, 4> mask_lut{
-                {{},
-                 {0x1, 0x2, 0x4, 0x8, 0x3},
-                 {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
-                 {0x7, 0xb, 0xd, 0xe, 0xf}}};
+            static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
+                {},
+                {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
+                {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
+                {0x7, 0xb, 0xd, 0xe, 0xf},
+            }};
 
             size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
             index |= gpr28.Value() != Register::ZeroIndex ? 2 : 0;
 
-            return ((1ull << component) & mask_lut[index][component_mask_selector]) != 0;
+            u32 mask = mask_lut[index][component_mask_selector];
+            // A mask of 0 means this instruction uses an unimplemented mask.
+            ASSERT(mask != 0);
+            return ((1ull << component) & mask) != 0;
         }
     } texs;
 
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 5a593c1f7..9758adcfd 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -55,6 +55,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RGBA8_UNORM:
     case RenderTargetFormat::RGBA8_SNORM:
     case RenderTargetFormat::RGBA8_SRGB:
+    case RenderTargetFormat::RGBA8_UINT:
     case RenderTargetFormat::RGB10_A2_UNORM:
     case RenderTargetFormat::BGRA8_UNORM:
     case RenderTargetFormat::RG16_UNORM:
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 97dcccb92..2697e1c27 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -30,6 +30,7 @@ enum class RenderTargetFormat : u32 {
     RGBA8_UNORM = 0xD5,
     RGBA8_SRGB = 0xD6,
     RGBA8_SNORM = 0xD7,
+    RGBA8_UINT = 0xD9,
     RG16_UNORM = 0xDA,
     RG16_SNORM = 0xDB,
     RG16_SINT = 0xDC,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 38aa067b6..fb7476fb8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -94,6 +94,7 @@ struct FormatTuple {
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
+    {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},   // ABGR8UI
     {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10U
@@ -245,6 +246,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<true, PixelFormat::ABGR8U>,
         MortonCopy<true, PixelFormat::ABGR8S>,
+        MortonCopy<true, PixelFormat::ABGR8UI>,
         MortonCopy<true, PixelFormat::B5G6R5U>,
         MortonCopy<true, PixelFormat::A2B10G10R10U>,
         MortonCopy<true, PixelFormat::A1B5G5R5U>,
@@ -299,6 +301,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         // clang-format off
         MortonCopy<false, PixelFormat::ABGR8U>,
         MortonCopy<false, PixelFormat::ABGR8S>,
+        MortonCopy<false, PixelFormat::ABGR8UI>,
         MortonCopy<false, PixelFormat::B5G6R5U>,
         MortonCopy<false, PixelFormat::A2B10G10R10U>,
         MortonCopy<false, PixelFormat::A1B5G5R5U>,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index beec01746..fc8b44219 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -25,59 +25,60 @@ struct SurfaceParams {
     enum class PixelFormat {
         ABGR8U = 0,
         ABGR8S = 1,
-        B5G6R5U = 2,
-        A2B10G10R10U = 3,
-        A1B5G5R5U = 4,
-        R8U = 5,
-        R8UI = 6,
-        RGBA16F = 7,
-        RGBA16U = 8,
-        RGBA16UI = 9,
-        R11FG11FB10F = 10,
-        RGBA32UI = 11,
-        DXT1 = 12,
-        DXT23 = 13,
-        DXT45 = 14,
-        DXN1 = 15, // This is also known as BC4
-        DXN2UNORM = 16,
-        DXN2SNORM = 17,
-        BC7U = 18,
-        ASTC_2D_4X4 = 19,
-        G8R8U = 20,
-        G8R8S = 21,
-        BGRA8 = 22,
-        RGBA32F = 23,
-        RG32F = 24,
-        R32F = 25,
-        R16F = 26,
-        R16U = 27,
-        R16S = 28,
-        R16UI = 29,
-        R16I = 30,
-        RG16 = 31,
-        RG16F = 32,
-        RG16UI = 33,
-        RG16I = 34,
-        RG16S = 35,
-        RGB32F = 36,
-        SRGBA8 = 37,
-        RG8U = 38,
-        RG8S = 39,
-        RG32UI = 40,
-        R32UI = 41,
+        ABGR8UI = 2,
+        B5G6R5U = 3,
+        A2B10G10R10U = 4,
+        A1B5G5R5U = 5,
+        R8U = 6,
+        R8UI = 7,
+        RGBA16F = 8,
+        RGBA16U = 9,
+        RGBA16UI = 10,
+        R11FG11FB10F = 11,
+        RGBA32UI = 12,
+        DXT1 = 13,
+        DXT23 = 14,
+        DXT45 = 15,
+        DXN1 = 16, // This is also known as BC4
+        DXN2UNORM = 17,
+        DXN2SNORM = 18,
+        BC7U = 19,
+        ASTC_2D_4X4 = 20,
+        G8R8U = 21,
+        G8R8S = 22,
+        BGRA8 = 23,
+        RGBA32F = 24,
+        RG32F = 25,
+        R32F = 26,
+        R16F = 27,
+        R16U = 28,
+        R16S = 29,
+        R16UI = 30,
+        R16I = 31,
+        RG16 = 32,
+        RG16F = 33,
+        RG16UI = 34,
+        RG16I = 35,
+        RG16S = 36,
+        RGB32F = 37,
+        SRGBA8 = 38,
+        RG8U = 39,
+        RG8S = 40,
+        RG32UI = 41,
+        R32UI = 42,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 42,
-        Z16 = 43,
+        Z32F = 43,
+        Z16 = 44,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 44,
-        S8Z24 = 45,
-        Z32FS8 = 46,
+        Z24S8 = 45,
+        S8Z24 = 46,
+        Z32FS8 = 47,
 
         MaxDepthStencilFormat,
 
@@ -117,6 +118,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{
             1, // ABGR8U
             1, // ABGR8S
+            1, // ABGR8UI
             1, // B5G6R5U
             1, // A2B10G10R10U
             1, // A1B5G5R5U
@@ -175,6 +177,7 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
             32,  // ABGR8U
             32,  // ABGR8S
+            32,  // ABGR8UI
             16,  // B5G6R5U
             32,  // A2B10G10R10U
             16,  // A1B5G5R5U
@@ -257,6 +260,8 @@ struct SurfaceParams {
             return PixelFormat::ABGR8U;
         case Tegra::RenderTargetFormat::RGBA8_SNORM:
             return PixelFormat::ABGR8S;
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
+            return PixelFormat::ABGR8UI;
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
@@ -327,6 +332,8 @@ struct SurfaceParams {
                 return PixelFormat::ABGR8U;
             case Tegra::Texture::ComponentType::SNORM:
                 return PixelFormat::ABGR8S;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::ABGR8UI;
             }
             LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
                          static_cast<u32>(component_type));
@@ -551,6 +558,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R16_UINT:
         case Tegra::RenderTargetFormat::RG32_UINT:
         case Tegra::RenderTargetFormat::R32_UINT:
+        case Tegra::RenderTargetFormat::RGBA8_UINT:
             return ComponentType::UInt;
         case Tegra::RenderTargetFormat::RG16_SINT:
         case Tegra::RenderTargetFormat::R16_SINT:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f466b9427..f3b2d1328 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -748,6 +748,30 @@ private:
         return op->second;
     }
 
+    /**
+     * Wraps a GLSL operand expression string in abs() and/or a negation, as requested by the
+     * flags. When both abs and neg are true, the negation is applied after taking the absolute
+     * value, i.e. the result is the negated absolute value of the operand.
+     * @param operand The input GLSL expression to take the abs() of, negate, or both.
+     * @param abs Whether to wrap the operand in the GLSL abs() function.
+     * @param neg Whether to negate the operand; applied to the abs() result when abs is also set.
+     * @returns The transformed GLSL expression string, e.g. "-(abs(x))" for operand "x" with both
+     * flags set; the operand is returned unchanged when neither flag is set.
+     */
+    static std::string GetOperandAbsNeg(const std::string& operand, bool abs, bool neg) {
+        std::string result = operand;
+
+        if (abs) {
+            result = "abs(" + result + ')';
+        }
+
+        if (neg) {
+            result = "-(" + result + ')';
+        }
+
+        return result;
+    }
+    }
+
     /*
      * Returns whether the instruction at the specified offset is a 'sched' instruction.
      * Sched instructions always appear before a sequence of 3 instructions.
@@ -843,6 +867,33 @@ private:
         shader.AddLine('}');
     }
 
+    /**
+     * Emits a GLSL scope that stores the target address (as an unsigned literal) into
+     * ssy_stack[ssy_stack_top] and then increments ssy_stack_top. No overflow check is emitted.
+     */
+    void EmitPushToSSYStack(u32 target) {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack[ssy_stack_top] = " + std::to_string(target) + "u;");
+        shader.AddLine("ssy_stack_top++;");
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
+    /**
+     * Emits a GLSL scope that pops the SSY address stack: decrements ssy_stack_top, assigns the
+     * popped address to jmp_to and then emits a break. No underflow check is emitted.
+     */
+    void EmitPopFromSSYStack() {
+        shader.AddLine('{');
+        ++shader.scope;
+        shader.AddLine("ssy_stack_top--;");
+        shader.AddLine("jmp_to = ssy_stack[ssy_stack_top];");
+        shader.AddLine("break;");
+        --shader.scope;
+        shader.AddLine('}');
+    }
+
     /**
      * Compiles a single instruction from Tegra to GLSL.
      * @param offset the offset of the Tegra shader instruction.
@@ -887,13 +938,6 @@ private:
         switch (opcode->GetType()) {
         case OpCode::Type::Arithmetic: {
             std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
-            if (instr.alu.abs_a) {
-                op_a = "abs(" + op_a + ')';
-            }
-
-            if (instr.alu.negate_a) {
-                op_a = "-(" + op_a + ')';
-            }
 
             std::string op_b;
 
@@ -908,17 +952,10 @@ private:
                 }
             }
 
-            if (instr.alu.abs_b) {
-                op_b = "abs(" + op_b + ')';
-            }
-
-            if (instr.alu.negate_b) {
-                op_b = "-(" + op_b + ')';
-            }
-
             switch (opcode->GetId()) {
             case OpCode::Id::MOV_C:
             case OpCode::Id::MOV_R: {
+                // MOV has neither 'abs' nor 'neg' bits.
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 break;
             }
@@ -926,6 +963,8 @@ private:
             case OpCode::Id::FMUL_C:
             case OpCode::Id::FMUL_R:
             case OpCode::Id::FMUL_IMM: {
+                // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
+                op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
@@ -933,11 +972,14 @@ private:
             case OpCode::Id::FADD_C:
             case OpCode::Id::FADD_R:
             case OpCode::Id::FADD_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
                                         instr.alu.saturate_d);
                 break;
             }
             case OpCode::Id::MUFU: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
                 switch (instr.sub_op) {
                 case SubOp::Cos:
                     regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
@@ -977,6 +1019,9 @@ private:
             case OpCode::Id::FMNMX_C:
             case OpCode::Id::FMNMX_R:
             case OpCode::Id::FMNMX_IMM: {
+                op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a);
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
+
                 std::string condition =
                     GetPredicateCondition(instr.alu.fmnmx.pred, instr.alu.fmnmx.negate_pred != 0);
                 std::string parameters = op_a + ',' + op_b;
@@ -990,7 +1035,7 @@ private:
             case OpCode::Id::RRO_R:
             case OpCode::Id::RRO_IMM: {
                 // Currently RRO is only implemented as a register move.
-                // Usage of `abs_b` and `negate_b` here should also be correct.
+                op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b);
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
                 LOG_WARNING(HW_GPU, "RRO instruction is incomplete");
                 break;
@@ -1267,8 +1312,6 @@ private:
             break;
         }
         case OpCode::Type::Conversion: {
-            ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
-
             switch (opcode->GetId()) {
             case OpCode::Id::I2I_R: {
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
@@ -1981,13 +2024,13 @@ private:
                 ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported");
 
                 u32 target = offset + instr.bra.GetBranchTarget();
-                shader.AddLine("ssy_target = " + std::to_string(target) + "u;");
+                EmitPushToSSYStack(target);
                 break;
             }
             case OpCode::Id::SYNC: {
                 // The SYNC opcode jumps to the address previously set by the SSY opcode
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
-                shader.AddLine("{ jmp_to = ssy_target; break; }");
+                EmitPopFromSSYStack();
                 break;
             }
             case OpCode::Id::DEPBAR: {
@@ -2058,7 +2101,13 @@ private:
             } else {
                 labels.insert(subroutine.begin);
                 shader.AddLine("uint jmp_to = " + std::to_string(subroutine.begin) + "u;");
-                shader.AddLine("uint ssy_target = 0u;");
+
+                // TODO(Subv): Figure out the actual depth of the SSY stack, for now it seems
+                // unlikely that shaders will use 20 nested SSYs.
+                constexpr u32 SSY_STACK_SIZE = 20;
+                shader.AddLine("uint ssy_stack[" + std::to_string(SSY_STACK_SIZE) + "];");
+                shader.AddLine("uint ssy_stack_top = 0u;");
+
                 shader.AddLine("while (true) {");
                 ++shader.scope;
 
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 8f719fdd8..5d91a0c2f 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -147,6 +147,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
         // manually mix them. However the shader part of this is not yet implemented.
         return GL_CLAMP_TO_BORDER;
+    case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
+        return GL_MIRROR_CLAMP_TO_EDGE;
     }
     LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
     UNREACHABLE();