21 files changed, 313 insertions, 167 deletions
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 98a8b5337..7ff44f06d 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -29,8 +29,8 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
 }
 
 void Fermi2D::HandleSurfaceCopy() {
-    LOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
-                static_cast<u32>(regs.operation));
+    LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
+              static_cast<u32>(regs.operation));
 
     // TODO(Subv): Only raw copies are implemented.
     ASSERT(regs.operation == Operation::SrcCopy);
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index e9c15beff..b318aedb8 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -639,7 +639,7 @@ void Maxwell3D::ProcessSyncPoint() {
 }
 
 void Maxwell3D::DrawArrays() {
-    LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
               regs.vertex_buffer.count);
     ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
 
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 052e6d24e..28272ef6f 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 {
     Exch = 8,
 };
 
+enum class ShuffleOperation : u64 {
+    Idx = 0,  // Indexed shuffle (GLSL: shuffleNV)
+    Up = 1,   // Shuffle up (GLSL: shuffleUpNV)
+    Down = 2, // Shuffle down (GLSL: shuffleDownNV)
+    Bfly = 3, // Butterfly shuffle (GLSL: shuffleXorNV)
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -600,6 +607,15 @@ union Instruction {
     } vote;
 
     union {
+        BitField<30, 2, ShuffleOperation> operation;
+        BitField<48, 3, u64> pred48;
+        BitField<28, 1, u64> is_index_imm;
+        BitField<29, 1, u64> is_mask_imm;
+        BitField<20, 5, u64> index_imm;
+        BitField<34, 13, u64> mask_imm;
+    } shfl;
+
+    union {
         BitField<8, 8, Register> gpr;
         BitField<20, 24, s64> offset;
     } gmem;
@@ -934,6 +950,11 @@ union Instruction {
     } isetp;
 
     union {
+        BitField<48, 1, u64> is_signed;
+        BitField<49, 3, PredCondition> cond;
+    } icmp;
+
+    union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
         BitField<12, 3, u64> pred12;
@@ -1542,6 +1563,7 @@ public:
         BRK,
         DEPBAR,
         VOTE,
+        SHFL,
         BFE_C,
         BFE_R,
         BFE_IMM,
@@ -1628,6 +1650,10 @@ public:
         SEL_C,
         SEL_R,
         SEL_IMM,
+        ICMP_RC,
+        ICMP_R,
+        ICMP_CR,
+        ICMP_IMM,
         MUFU,  // Multi-Function Operator
         RRO_C, // Range Reduction Operator
         RRO_R,
@@ -1833,6 +1859,7 @@ private:
             INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
             INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
             INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
+            INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
             INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
@@ -1892,6 +1919,10 @@ private:
             INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
             INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
             INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
+            INST("010100110100----", Id::ICMP_RC, Type::ArithmeticInteger, "ICMP_RC"),
+            INST("010110110100----", Id::ICMP_R, Type::ArithmeticInteger, "ICMP_R"),
+            INST("010010110100----", Id::ICMP_CR, Type::ArithmeticInteger, "ICMP_CR"),
+            INST("0011011-0100----", Id::ICMP_IMM, Type::ArithmeticInteger, "ICMP_IMM"),
             INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"),
             INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"),
             INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 2c47541cb..76cfe8107 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -122,6 +122,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RGBA16_UINT:
     case RenderTargetFormat::RGBA16_UNORM:
     case RenderTargetFormat::RGBA16_FLOAT:
+    case RenderTargetFormat::RGBX16_FLOAT:
     case RenderTargetFormat::RG32_FLOAT:
     case RenderTargetFormat::RG32_UINT:
         return 8;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 78bc0601a..29fa8e95b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -42,6 +42,7 @@ enum class RenderTargetFormat : u32 {
     RGBA16_FLOAT = 0xCA,
     RG32_FLOAT = 0xCB,
     RG32_UINT = 0xCD,
+    RGBX16_FLOAT = 0xCE,
     BGRA8_UNORM = 0xCF,
     BGRA8_SRGB = 0xD0,
     RGB10_A2_UNORM = 0xD1,
diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index 084f85e67..ab71870ab 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -83,6 +83,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
     MortonCopy<true, PixelFormat::RG8U>,
     MortonCopy<true, PixelFormat::RG8S>,
     MortonCopy<true, PixelFormat::RG32UI>,
+    MortonCopy<true, PixelFormat::RGBX16F>,
     MortonCopy<true, PixelFormat::R32UI>,
     MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
     MortonCopy<true, PixelFormat::ASTC_2D_8X5>,
@@ -151,6 +152,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
     MortonCopy<false, PixelFormat::RG8U>,
     MortonCopy<false, PixelFormat::RG8S>,
     MortonCopy<false, PixelFormat::RG32UI>,
+    MortonCopy<false, PixelFormat::RGBX16F>,
     MortonCopy<false, PixelFormat::R32UI>,
     nullptr,
     nullptr,
diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp
index 7c926bd48..a5d69d78d 100644
--- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp
@@ -35,21 +35,16 @@ OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheK
     local_state.draw.draw_framebuffer = framebuffer.handle;
     local_state.ApplyFramebufferState();
 
-    if (key.is_single_buffer) {
-        if (key.color_attachments[0] != GL_NONE && key.colors[0]) {
-            key.colors[0]->Attach(key.color_attachments[0], GL_DRAW_FRAMEBUFFER);
-            glDrawBuffer(key.color_attachments[0]);
-        } else {
-            glDrawBuffer(GL_NONE);
-        }
-    } else {
-        for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
-            if (key.colors[index]) {
-                key.colors[index]->Attach(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index),
-                                          GL_DRAW_FRAMEBUFFER);
-            }
+    for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+        if (key.colors[index]) {
+            key.colors[index]->Attach(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index),
+                                      GL_DRAW_FRAMEBUFFER);
         }
+    }
+    if (key.colors_count) {
         glDrawBuffers(key.colors_count, key.color_attachments.data());
+    } else {
+        glDrawBuffer(GL_NONE);
     }
 
     if (key.zeta) {
@@ -67,9 +62,9 @@ std::size_t FramebufferCacheKey::Hash() const {
 }
 
 bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const {
-    return std::tie(is_single_buffer, stencil_enable, colors_count, color_attachments, colors,
-                    zeta) == std::tie(rhs.is_single_buffer, rhs.stencil_enable, rhs.colors_count,
-                                      rhs.color_attachments, rhs.colors, rhs.zeta);
+    return std::tie(stencil_enable, colors_count, color_attachments, colors, zeta) ==
+           std::tie(rhs.stencil_enable, rhs.colors_count, rhs.color_attachments, rhs.colors,
+                    rhs.zeta);
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h
index a3a996353..424344c48 100644
--- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h
@@ -19,7 +19,6 @@
 namespace OpenGL {
 
 struct alignas(sizeof(u64)) FramebufferCacheKey {
-    bool is_single_buffer = false;
     bool stencil_enable = false;
     u16 colors_count = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 246b892c5..6a17bed72 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -382,99 +382,51 @@ void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading,
     shader_cache.LoadDiskCache(stop_loading, callback);
 }
 
-std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
-    OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool preserve_contents,
-    std::optional<std::size_t> single_color_target) {
+void RasterizerOpenGL::ConfigureFramebuffers() {
     MICROPROFILE_SCOPE(OpenGL_Framebuffer);
     auto& gpu = system.GPU().Maxwell3D();
-    const auto& regs = gpu.regs;
-
-    const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
-                                                 single_color_target};
-    if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {
-        // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
-        // single color targets). This is done because the guest registers may not change but the
-        // host framebuffer may contain different attachments
-        return current_depth_stencil_usage;
+    if (!gpu.dirty.render_settings) {
+        return;
     }
     gpu.dirty.render_settings = false;
-    current_framebuffer_config_state = fb_config_state;
 
     texture_cache.GuardRenderTargets(true);
 
-    View depth_surface{};
-    if (using_depth_fb) {
-        depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents);
-    } else {
-        texture_cache.SetEmptyDepthBuffer();
-    }
+    View depth_surface = texture_cache.GetDepthBufferSurface(true);
 
+    const auto& regs = gpu.regs;
+    state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0;
     UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
 
     // Bind the framebuffer surfaces
-    current_state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0;
-
     FramebufferCacheKey fbkey;
+    for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+        View color_surface{texture_cache.GetColorBufferSurface(index, true)};
 
-    if (using_color_fb) {
-        if (single_color_target) {
-            // Used when just a single color attachment is enabled, e.g. for clearing a color buffer
-            View color_surface{
-                texture_cache.GetColorBufferSurface(*single_color_target, preserve_contents)};
-
-            if (color_surface) {
-                // Assume that a surface will be written to if it is used as a framebuffer, even if
-                // the shader doesn't actually write to it.
-                texture_cache.MarkColorBufferInUse(*single_color_target);
-            }
-
-            fbkey.is_single_buffer = true;
-            fbkey.color_attachments[0] =
-                GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target);
-            fbkey.colors[0] = color_surface;
-            for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
-                if (index != *single_color_target) {
-                    texture_cache.SetEmptyColorBuffer(index);
-                }
-            }
-        } else {
-            // Multiple color attachments are enabled
-            for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
-                View color_surface{texture_cache.GetColorBufferSurface(index, preserve_contents)};
-
-                if (color_surface) {
-                    // Assume that a surface will be written to if it is used as a framebuffer, even
-                    // if the shader doesn't actually write to it.
-                    texture_cache.MarkColorBufferInUse(index);
-                }
-
-                fbkey.color_attachments[index] =
-                    GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index);
-                fbkey.colors[index] = color_surface;
-            }
-            fbkey.is_single_buffer = false;
-            fbkey.colors_count = regs.rt_control.count;
+        if (color_surface) {
+            // Assume that a surface will be written to if it is used as a framebuffer, even
+            // if the shader doesn't actually write to it.
+            texture_cache.MarkColorBufferInUse(index);
         }
-    } else {
-        // No color attachments are enabled - leave them as zero
-        fbkey.is_single_buffer = true;
+
+        fbkey.color_attachments[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index);
+        fbkey.colors[index] = std::move(color_surface);
     }
+    fbkey.colors_count = regs.rt_control.count;
 
     if (depth_surface) {
         // Assume that a surface will be written to if it is used as a framebuffer, even if
         // the shader doesn't actually write to it.
         texture_cache.MarkDepthBufferInUse();
 
-        fbkey.zeta = depth_surface;
         fbkey.stencil_enable = depth_surface->GetSurfaceParams().type == SurfaceType::DepthStencil;
+        fbkey.zeta = std::move(depth_surface);
     }
 
     texture_cache.GuardRenderTargets(false);
 
-    current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(fbkey);
-    SyncViewport(current_state);
-
-    return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};
+    state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(fbkey);
+    SyncViewport(state);
 }
 
 void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
@@ -684,7 +636,7 @@ void RasterizerOpenGL::DrawPrelude() {
     SetupShaders(primitive_mode);
     texture_cache.GuardSamplers(false);
 
-    ConfigureFramebuffers(state);
+    ConfigureFramebuffers();
 
     // Signal the buffer cache that we are not going to upload more things.
     const bool invalidate = buffer_cache.Unmap();
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 682f0becc..9c10ebda3 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -77,39 +77,8 @@ public:
                            const VideoCore::DiskResourceLoadCallback& callback) override;
 
 private:
-    struct FramebufferConfigState {
-        bool using_color_fb{};
-        bool using_depth_fb{};
-        bool preserve_contents{};
-        std::optional<std::size_t> single_color_target;
-
-        bool operator==(const FramebufferConfigState& rhs) const {
-            return std::tie(using_color_fb, using_depth_fb, preserve_contents,
-                            single_color_target) == std::tie(rhs.using_color_fb, rhs.using_depth_fb,
-                                                             rhs.preserve_contents,
-                                                             rhs.single_color_target);
-        }
-        bool operator!=(const FramebufferConfigState& rhs) const {
-            return !operator==(rhs);
-        }
-    };
-
-    /**
-     * Configures the color and depth framebuffer states.
-     *
-     * @param current_state       The current OpenGL state.
-     * @param using_color_fb      If true, configure color framebuffers.
-     * @param using_depth_fb      If true, configure the depth/stencil framebuffer.
-     * @param preserve_contents   If true, tries to preserve data from a previously used
-     *                            framebuffer.
-     * @param single_color_target Specifies if a single color buffer target should be used.
-     *
-     * @returns If depth (first) or stencil (second) are being stored in the bound zeta texture
-     *          (requires using_depth_fb to be true)
-     */
-    std::pair<bool, bool> ConfigureFramebuffers(
-        OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
-        bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
+    /// Configures the color and depth framebuffer states.
+    void ConfigureFramebuffers();
 
     void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
                                    bool using_depth_fb, bool using_stencil_fb);
@@ -231,9 +200,6 @@ private:
              OGLVertexArray>
         vertex_array_cache;
 
-    FramebufferConfigState current_framebuffer_config_state;
-    std::pair<bool, bool> current_depth_stencil_usage{};
-
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 909ccb82c..0dbc4c02f 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
     std::string source = "#version 430 core\n"
                          "#extension GL_ARB_separate_shader_objects : enable\n"
                          "#extension GL_NV_gpu_shader5 : enable\n"
-                         "#extension GL_NV_shader_thread_group : enable\n";
+                         "#extension GL_NV_shader_thread_group : enable\n"
+                         "#extension GL_NV_shader_thread_shuffle : enable\n";
     if (entries.shader_viewport_layer_array) {
         source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f7e86ab26..74cb59bc1 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1029,10 +1029,10 @@ private:
         return {std::move(temporary), value.GetType()};
     }
 
-    Expression GetOutputAttribute(const AbufNode* abuf) {
+    std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) {
         switch (const auto attribute = abuf->GetIndex()) {
         case Attribute::Index::Position:
-            return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float};
+            return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}};
         case Attribute::Index::LayerViewportPointSize:
             switch (abuf->GetElement()) {
             case 0:
@@ -1042,25 +1042,25 @@ private:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return {"gl_Layer", Type::Int};
+                return {{"gl_Layer", Type::Int}};
             case 2:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return {"gl_ViewportIndex", Type::Int};
+                return {{"gl_ViewportIndex", Type::Int}};
             case 3:
                 UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
-                return {"gl_PointSize", Type::Float};
+                return {{"gl_PointSize", Type::Float}};
             }
             return {};
         case Attribute::Index::ClipDistances0123:
-            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float};
+            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}};
         case Attribute::Index::ClipDistances4567:
-            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float};
+            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
         default:
             if (IsGenericAttribute(attribute)) {
-                return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()),
-                        Type::Float};
+                return {
+                    {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
             }
             UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
             return {};
@@ -1300,7 +1300,11 @@ private:
             target = {GetRegister(gpr->GetIndex()), Type::Float};
         } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
-            target = GetOutputAttribute(abuf);
+            auto output = GetOutputAttribute(abuf);
+            if (!output) {
+                return {};
+            }
+            target = std::move(*output);
         } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             if (stage == ProgramType::Compute) {
                 LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
@@ -1961,8 +1965,7 @@ private:
     Expression BallotThread(Operation operation) {
         const std::string value = VisitOperand(operation, 0).AsBool();
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia warp intrinsics are not available and its required by a shader");
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
             // Stub on non-Nvidia devices by simulating all threads voting the same as the active
             // one.
             return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint};
@@ -1973,8 +1976,7 @@ private:
     Expression Vote(Operation operation, const char* func) {
         const std::string value = VisitOperand(operation, 0).AsBool();
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia vote intrinsics are not available and its required by a shader");
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
             // Stub with a warp size of one.
             return {value, Type::Bool};
         }
@@ -1991,15 +1993,54 @@ private:
 
     Expression VoteEqual(Operation operation) {
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia vote intrinsics are not available and its required by a shader");
-            // We must return true here since a stub for a theoretical warp size of 1 will always
-            // return an equal result for all its votes.
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
+            // We must return true here since a stub for a theoretical warp size of 1 will always
+            // return an equal result across all votes.
             return {"true", Type::Bool};
         }
         return Vote(operation, "allThreadsEqualNV");
     }
 
+    template <const std::string_view& func>
+    Expression Shuffle(Operation operation) {
+        const std::string data = VisitOperand(operation, 0).AsFloat();
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader");
+            // Stub for a "single-thread" device: the source is either this same thread or out of
+            // bounds, and in both cases the passed value is handed back unchanged.
+            return {data, Type::Float};
+        }
+
+        const std::string selector = VisitOperand(operation, 1).AsUint();
+        const std::string width = VisitOperand(operation, 2).AsUint();
+        return {fmt::format("{}({}, {}, {})", func, data, selector, width), Type::Float};
+    }
+
+    template <const std::string_view& func>
+    Expression InRangeShuffle(Operation operation) {
+        const std::string index = VisitOperand(operation, 0).AsUint();
+        if (!device.HasWarpIntrinsics()) {
+            // On a "single-thread" device we are only in bounds when the requested index is 0.
+            return {fmt::format("({} == 0U)", index), Type::Bool};
+        }
+        // The width operand is only consumed by the intrinsic, so visit it after the stub path.
+        const std::string width = VisitOperand(operation, 1).AsUint();
+        const std::string in_range = code.GenerateTemporary();
+        code.AddLine("bool {};", in_range);
+        code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range);
+        return {in_range, Type::Bool};
+    }
+
+    struct Func final {
+        Func() = delete;
+        ~Func() = delete;
+        // GLSL function names bound as template arguments of Shuffle and InRangeShuffle.
+        static constexpr std::string_view ShuffleIndexed = "shuffleNV";
+        static constexpr std::string_view ShuffleUp = "shuffleUpNV";
+        static constexpr std::string_view ShuffleDown = "shuffleDownNV";
+        static constexpr std::string_view ShuffleButterfly = "shuffleXorNV";
+    };
+
     static constexpr std::array operation_decompilers = {
         &GLSLDecompiler::Assign,
 
@@ -2162,6 +2203,16 @@ private:
         &GLSLDecompiler::VoteAll,
         &GLSLDecompiler::VoteAny,
         &GLSLDecompiler::VoteEqual,
+
+        &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleUp>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleDown>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>,
+
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 4f135fe03..173b76c4e 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -97,6 +97,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
     {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},            // RG8U
     {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                     // RG8S
     {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},   // RG32UI
+    {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false},          // RGBX16F
     {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},   // R32UI
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_8X8
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_8X5
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ea77dd211..9ed738171 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -145,7 +145,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
         case Tegra::Texture::TextureMipmapFilter::None:
             return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
             return GL_LINEAR_MIPMAP_LINEAR;
         }
@@ -157,7 +157,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
         case Tegra::Texture::TextureMipmapFilter::Nearest:
             return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
     }
     }
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 0bbbf6851..3c5acda3e 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -143,6 +143,7 @@ static constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // RG8U
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // RG8S
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // RG32UI
+    {vk::Format::eUndefined, ComponentType::Invalid, false},           // RGBX16F
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // R32UI
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // ASTC_2D_8X8
     {vk::Format::eUndefined, ComponentType::Invalid, false},           // ASTC_2D_8X5
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index b9153934e..f7fbbb6e4 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1127,6 +1127,46 @@ private:
         return {};
     }
 
+    Id ShuffleIndexed(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleUp(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleDown(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleButterfly(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleIndexed(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleUp(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleDown(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleButterfly(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                       const std::string& name) {
         const Id id = OpVariable(type, storage);
@@ -1431,6 +1471,16 @@ private:
         &SPIRVDecompiler::VoteAll,
         &SPIRVDecompiler::VoteAny,
         &SPIRVDecompiler::VoteEqual,
+
+        &SPIRVDecompiler::ShuffleIndexed,
+        &SPIRVDecompiler::ShuffleUp,
+        &SPIRVDecompiler::ShuffleDown,
+        &SPIRVDecompiler::ShuffleButterfly,
+
+        &SPIRVDecompiler::InRangeShuffleIndexed,
+        &SPIRVDecompiler::InRangeShuffleUp,
+        &SPIRVDecompiler::InRangeShuffleDown,
+        &SPIRVDecompiler::InRangeShuffleButterfly,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index c8c1a7f40..b73f6536e 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -138,6 +138,35 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
         SetRegister(bb, instr.gpr0, value);
         break;
     }
+    case OpCode::Id::ICMP_CR:
+    case OpCode::Id::ICMP_R:
+    case OpCode::Id::ICMP_RC:
+    case OpCode::Id::ICMP_IMM: {
+        const Node zero = Immediate(0); // `test` is compared against this value below.
+
+        const auto [op_b, test] = [&]() -> std::pair<Node, Node> { // Operands vary per encoding.
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::ICMP_CR: // op_b = const buffer, test = gpr39
+                return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
+                        GetRegister(instr.gpr39)};
+            case OpCode::Id::ICMP_R: // op_b = gpr20, test = gpr39
+                return {GetRegister(instr.gpr20), GetRegister(instr.gpr39)};
+            case OpCode::Id::ICMP_RC: // op_b = gpr39, test = const buffer
+                return {GetRegister(instr.gpr39),
+                        GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset)};
+            case OpCode::Id::ICMP_IMM: // op_b = immediate (GetSignedImm20_20), test = gpr39
+                return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)};
+            default:
+                UNREACHABLE();
+                return {zero, zero}; // Dummy pair so every path returns after UNREACHABLE().
+            }
+        }();
+        const Node op_a = GetRegister(instr.gpr8); // Paired with op_b in the Select below.
+        const Node comparison =
+            GetPredicateComparisonInteger(instr.icmp.cond, instr.icmp.is_signed != 0, test, zero);
+        SetRegister(bb, instr.gpr0, Operation(OperationCode::Select, comparison, op_a, op_b));
+        break;
+    }
     case OpCode::Id::LOP_C:
     case OpCode::Id::LOP_R:
     case OpCode::Id::LOP_IMM: {
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
index 04ca74f46..a8e481b3c 100644
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@@ -13,6 +13,7 @@ namespace VideoCommon::Shader {
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Pred;
+using Tegra::Shader::ShuffleOperation;
 using Tegra::Shader::VoteOperation;
 
 namespace {
@@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
         SetPredicate(bb, instr.vote.dest_pred, vote);
         break;
     }
+    case OpCode::Id::SHFL: {
+        Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm))
+                                           : GetRegister(instr.gpr39);
+        Node width = [&] {
+            // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This was
+            // derived by reversing Nvidia's math. It won't work in all cases due to SHFL having
+            // different parameters that don't properly map to GLSL's interface, but it should work
+            // for cases emitted by Nvidia's compiler.
+            if (instr.shfl.operation == ShuffleOperation::Up) {
+                return Operation(
+                    OperationCode::ILogicalShiftRight,
+                    Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)),
+                    Immediate(8)); // width = (mask - 0x2000) >> 8
+            } else {
+                return Operation(OperationCode::ILogicalShiftRight,
+                                 Operation(OperationCode::IAdd, Immediate(0x201F),
+                                           Operation(OperationCode::INegate, std::move(mask))),
+                                 Immediate(8)); // width = (0x201F - mask) >> 8
+            }
+        }();
+
+        const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> {
+            switch (instr.shfl.operation) {
+            case ShuffleOperation::Idx:
+                return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed};
+            case ShuffleOperation::Up:
+                return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp};
+            case ShuffleOperation::Down:
+                return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown};
+            case ShuffleOperation::Bfly:
+                return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly};
+            }
+            UNREACHABLE_MSG("Invalid SHFL operation: {}",
+                            static_cast<u64>(instr.shfl.operation.Value()));
+            return {}; // Only here so the lambda returns a value after UNREACHABLE_MSG.
+        }();
+
+        // Setting the predicate before the register is intentional to avoid overwriting.
+        Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm))
+                                             : GetRegister(instr.gpr20);
+        SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width));
+        SetRegister(
+            bb, instr.gpr0,
+            Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width)));
+        break;
+    }
     default:
         UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
         break;
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 425111cc4..abf2cb1ab 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -181,6 +181,16 @@ enum class OperationCode {
     VoteAny,      /// (bool) -> bool
     VoteEqual,    /// (bool) -> bool
 
+    ShuffleIndexed,   /// (uint value, uint index, uint width) -> uint
+    ShuffleUp,        /// (uint value, uint index, uint width) -> uint
+    ShuffleDown,      /// (uint value, uint index, uint width) -> uint
+    ShuffleButterfly, /// (uint value, uint index, uint width) -> uint
+
+    InRangeShuffleIndexed,   /// (uint index, uint width) -> bool
+    InRangeShuffleUp,        /// (uint index, uint width) -> bool
+    InRangeShuffleDown,      /// (uint index, uint width) -> bool
+    InRangeShuffleButterfly, /// (uint index, uint width) -> bool
+
     Amount,
 };
 
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index 53d0142cb..250afc6d6 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -159,6 +159,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
         return PixelFormat::R32UI;
     case Tegra::RenderTargetFormat::RG32_UINT:
         return PixelFormat::RG32UI;
+    case Tegra::RenderTargetFormat::RGBX16_FLOAT:
+        return PixelFormat::RGBX16F;
     default:
         LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
         UNREACHABLE();
@@ -415,6 +417,7 @@ ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format) {
     case Tegra::RenderTargetFormat::RG8_SNORM:
         return ComponentType::SNorm;
     case Tegra::RenderTargetFormat::RGBA16_FLOAT:
+    case Tegra::RenderTargetFormat::RGBX16_FLOAT:
     case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
     case Tegra::RenderTargetFormat::RGBA32_FLOAT:
     case Tegra::RenderTargetFormat::RG32_FLOAT:
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index 19268b7cd..1e1c432a5 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -57,36 +57,37 @@ enum class PixelFormat {
     RG8U = 39,
     RG8S = 40,
     RG32UI = 41,
-    R32UI = 42,
-    ASTC_2D_8X8 = 43,
-    ASTC_2D_8X5 = 44,
-    ASTC_2D_5X4 = 45,
-    BGRA8_SRGB = 46,
-    DXT1_SRGB = 47,
-    DXT23_SRGB = 48,
-    DXT45_SRGB = 49,
-    BC7U_SRGB = 50,
-    ASTC_2D_4X4_SRGB = 51,
-    ASTC_2D_8X8_SRGB = 52,
-    ASTC_2D_8X5_SRGB = 53,
-    ASTC_2D_5X4_SRGB = 54,
-    ASTC_2D_5X5 = 55,
-    ASTC_2D_5X5_SRGB = 56,
-    ASTC_2D_10X8 = 57,
-    ASTC_2D_10X8_SRGB = 58,
+    RGBX16F = 42,
+    R32UI = 43,
+    ASTC_2D_8X8 = 44,
+    ASTC_2D_8X5 = 45,
+    ASTC_2D_5X4 = 46,
+    BGRA8_SRGB = 47,
+    DXT1_SRGB = 48,
+    DXT23_SRGB = 49,
+    DXT45_SRGB = 50,
+    BC7U_SRGB = 51,
+    ASTC_2D_4X4_SRGB = 52,
+    ASTC_2D_8X8_SRGB = 53,
+    ASTC_2D_8X5_SRGB = 54,
+    ASTC_2D_5X4_SRGB = 55,
+    ASTC_2D_5X5 = 56,
+    ASTC_2D_5X5_SRGB = 57,
+    ASTC_2D_10X8 = 58,
+    ASTC_2D_10X8_SRGB = 59,
 
     MaxColorFormat,
 
     // Depth formats
-    Z32F = 59,
-    Z16 = 60,
+    Z32F = 60,
+    Z16 = 61,
 
     MaxDepthFormat,
 
     // DepthStencil formats
-    Z24S8 = 61,
-    S8Z24 = 62,
-    Z32FS8 = 63,
+    Z24S8 = 62,
+    S8Z24 = 63,
+    Z32FS8 = 64,
 
     MaxDepthStencilFormat,
 
@@ -166,6 +167,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{
     0, // RG8U
     0, // RG8S
     0, // RG32UI
+    0, // RGBX16F
     0, // R32UI
     2, // ASTC_2D_8X8
     2, // ASTC_2D_8X5
@@ -249,6 +251,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{
     1,  // RG8U
     1,  // RG8S
     1,  // RG32UI
+    1,  // RGBX16F
     1,  // R32UI
     8,  // ASTC_2D_8X8
     8,  // ASTC_2D_8X5
@@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{
     1, // RG8U
     1, // RG8S
     1, // RG32UI
+    1, // RGBX16F
     1, // R32UI
     8, // ASTC_2D_8X8
     5, // ASTC_2D_8X5
@@ -399,6 +403,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
     16,  // RG8U
     16,  // RG8S
     64,  // RG32UI
+    64,  // RGBX16F
     32,  // R32UI
     128, // ASTC_2D_8X8
     128, // ASTC_2D_8X5
@@ -489,6 +494,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table
     SurfaceCompression::None,       // RG8U
     SurfaceCompression::None,       // RG8S
     SurfaceCompression::None,       // RG32UI
+    SurfaceCompression::None,       // RGBX16F
     SurfaceCompression::None,       // R32UI
     SurfaceCompression::Converted,  // ASTC_2D_8X8
     SurfaceCompression::Converted,  // ASTC_2D_8X5