18 files changed, 284 insertions, 371 deletions
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 25652e794..48b86f3bd 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -71,16 +71,6 @@ GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t s
     return uploaded_offset;
 }
 
-std::tuple<u8*, GLintptr> OGLBufferCache::ReserveMemory(std::size_t size, std::size_t alignment) {
-    AlignBuffer(alignment);
-    u8* const uploaded_ptr = buffer_ptr;
-    const GLintptr uploaded_offset = buffer_offset;
-
-    buffer_ptr += size;
-    buffer_offset += size;
-    return std::make_tuple(uploaded_ptr, uploaded_offset);
-}
-
 bool OGLBufferCache::Map(std::size_t max_size) {
     bool invalidate;
     std::tie(buffer_ptr, buffer_offset_base, invalidate) =
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index f9247a40e..f2347581b 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -61,9 +61,6 @@ public:
     /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
     GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
 
-    /// Reserves memory to be used by host's CPU. Returns mapped address and offset.
-    std::tuple<u8*, GLintptr> ReserveMemory(std::size_t size, std::size_t alignment = 4);
-
     bool Map(std::size_t max_size);
     void Unmap();
 
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 1d1581f49..65a88b06c 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -2,11 +2,14 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <array>
 #include <cstddef>
 #include <glad/glad.h>
 
 #include "common/logging/log.h"
+#include "common/scope_exit.h"
 #include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
@@ -24,6 +27,7 @@ Device::Device() {
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
     max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
     has_variable_aoffi = TestVariableAoffi();
+    has_component_indexing_bug = TestComponentIndexingBug();
 }
 
 Device::Device(std::nullptr_t) {
@@ -31,6 +35,7 @@ Device::Device(std::nullptr_t) {
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_variable_aoffi = true;
+    has_component_indexing_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
@@ -52,4 +57,53 @@ void main() {
     return supported;
 }
 
+bool Device::TestComponentIndexingBug() { // true if dynamic uvec4 component reads come back wrong
+    constexpr char log_message[] = "Renderer_ComponentIndexingBug: {}";
+    const GLchar* COMPONENT_TEST = R"(#version 430 core
+layout (std430, binding = 0) buffer OutputBuffer {
+    uint output_value;
+};
+layout (std140, binding = 0) uniform InputBuffer {
+    uvec4 input_value[4096];
+};
+layout (location = 0) uniform uint idx;
+void main() {
+    output_value = input_value[idx >> 2][idx & 3];
+})";
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &COMPONENT_TEST)};
+    SCOPE_EXIT({ glDeleteProgram(shader); });
+    glUseProgram(shader);
+
+    OGLVertexArray vao;
+    vao.Create();
+    glBindVertexArray(vao.handle);
+
+    constexpr std::array<GLuint, 8> values{0, 0, 0, 0, 0x1236327, 0x985482, 0x872753, 0x2378432};
+    OGLBuffer ubo;
+    ubo.Create();
+    glNamedBufferData(ubo.handle, sizeof(values), values.data(), GL_STATIC_DRAW);
+    glBindBufferBase(GL_UNIFORM_BUFFER, 0, ubo.handle);
+
+    OGLBuffer ssbo;
+    ssbo.Create();
+    glNamedBufferStorage(ssbo.handle, sizeof(GLuint), nullptr, GL_CLIENT_STORAGE_BIT);
+
+    for (GLuint index = 4; index < 8; ++index) { // probe only the non-zero entries of values
+        glInvalidateBufferData(ssbo.handle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo.handle);
+
+        glProgramUniform1ui(shader, 0, index);
+        glDrawArrays(GL_POINTS, 0, 1);
+        glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); // make the shader store visible to readback
+        GLuint result{}; // 0 never equals values[4..7], so a failed readback reports the bug
+        glGetNamedBufferSubData(ssbo.handle, 0, sizeof(result), &result);
+        if (result != values.at(index)) {
+            LOG_INFO(Render_OpenGL, log_message, true);
+            return true;
+        }
+    }
+    LOG_INFO(Render_OpenGL, log_message, false);
+    return false;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index de8490682..8c8c93760 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,13 +30,19 @@ public:
         return has_variable_aoffi;
     }
 
+    bool HasComponentIndexingBug() const {
+        return has_component_indexing_bug; // assigned once in the Device constructors
+    }
+
 private:
     static bool TestVariableAoffi();
+    static bool TestComponentIndexingBug();
 
     std::size_t uniform_buffer_alignment{};
     u32 max_vertex_attributes{};
     u32 max_varyings{};
     bool has_variable_aoffi{};
+    bool has_component_indexing_bug{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
deleted file mode 100644
index c3e94d917..000000000
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <array>
-#include "common/assert.h"
-#include "common/common_types.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/renderer_opengl/gl_buffer_cache.h"
-#include "video_core/renderer_opengl/gl_primitive_assembler.h"
-
-namespace OpenGL {
-
-constexpr u32 TRIANGLES_PER_QUAD = 6;
-constexpr std::array<u32, TRIANGLES_PER_QUAD> QUAD_MAP = {0, 1, 2, 0, 2, 3};
-
-PrimitiveAssembler::PrimitiveAssembler(OGLBufferCache& buffer_cache) : buffer_cache(buffer_cache) {}
-
-PrimitiveAssembler::~PrimitiveAssembler() = default;
-
-std::size_t PrimitiveAssembler::CalculateQuadSize(u32 count) const {
-    ASSERT_MSG(count % 4 == 0, "Quad count is expected to be a multiple of 4");
-    return (count / 4) * TRIANGLES_PER_QUAD * sizeof(GLuint);
-}
-
-GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) {
-    const std::size_t size{CalculateQuadSize(count)};
-    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(size);
-
-    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
-        for (u32 i = 0; i < TRIANGLES_PER_QUAD; ++i) {
-            const u32 index = first + primitive * 4 + QUAD_MAP[i];
-            std::memcpy(dst_pointer, &index, sizeof(index));
-            dst_pointer += sizeof(index);
-        }
-    }
-
-    return index_offset;
-}
-
-GLintptr PrimitiveAssembler::MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count) {
-    const std::size_t map_size{CalculateQuadSize(count)};
-    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size);
-
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-    const u8* source{memory_manager.GetPointer(gpu_addr)};
-
-    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
-        for (std::size_t i = 0; i < TRIANGLES_PER_QUAD; ++i) {
-            const u32 index = primitive * 4 + QUAD_MAP[i];
-            const u8* src_offset = source + (index * index_size);
-
-            std::memcpy(dst_pointer, src_offset, index_size);
-            dst_pointer += index_size;
-        }
-    }
-
-    return index_offset;
-}
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h
deleted file mode 100644
index 4e87ce4d6..000000000
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <glad/glad.h>
-
-#include "common/common_types.h"
-
-namespace OpenGL {
-
-class OGLBufferCache;
-
-class PrimitiveAssembler {
-public:
-    explicit PrimitiveAssembler(OGLBufferCache& buffer_cache);
-    ~PrimitiveAssembler();
-
-    /// Calculates the size required by MakeQuadArray and MakeQuadIndexed.
-    std::size_t CalculateQuadSize(u32 count) const;
-
-    GLintptr MakeQuadArray(u32 first, u32 count);
-
-    GLintptr MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count);
-
-private:
-    OGLBufferCache& buffer_cache;
-};
-
-} // namespace OpenGL
-\ No newline at end of file
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index f9b6dfeea..d77426067 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -246,29 +246,6 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
     DrawParameters params{};
     params.current_instance = gpu.state.current_instance;
 
-    if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
-        MICROPROFILE_SCOPE(OpenGL_PrimitiveAssembly);
-
-        params.use_indexed = true;
-        params.primitive_mode = GL_TRIANGLES;
-
-        if (is_indexed) {
-            params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
-            params.count = (regs.index_array.count / 4) * 6;
-            params.index_buffer_offset = primitive_assembler.MakeQuadIndexed(
-                regs.index_array.IndexStart(), regs.index_array.FormatSizeInBytes(),
-                regs.index_array.count);
-            params.base_vertex = static_cast<GLint>(regs.vb_element_base);
-        } else {
-            // MakeQuadArray always generates u32 indexes
-            params.index_format = GL_UNSIGNED_INT;
-            params.count = (regs.vertex_buffer.count / 4) * 6;
-            params.index_buffer_offset = primitive_assembler.MakeQuadArray(
-                regs.vertex_buffer.first, regs.vertex_buffer.count);
-        }
-        return params;
-    }
-
     params.use_indexed = is_indexed;
     params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
 
@@ -345,9 +322,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         }
 
         const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
-        SetupConstBuffers(stage_enum, shader, program_handle, base_bindings);
-        SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings);
-        SetupTextures(stage_enum, shader, program_handle, base_bindings);
+        SetupDrawConstBuffers(stage_enum, shader);
+        SetupGlobalRegions(stage_enum, shader);
+        SetupTextures(stage_enum, shader, base_bindings);
 
         // Workaround for Intel drivers.
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
@@ -686,30 +663,19 @@ void RasterizerOpenGL::DrawArrays() {
     SyncCullMode();
     SyncPrimitiveRestart();
     SyncScissorTest(state);
-    // Alpha Testing is synced on shaders.
     SyncTransformFeedback();
     SyncPointState();
-    CheckAlphaTests();
     SyncPolygonOffset();
-    // TODO(bunnei): Sync framebuffer_scale uniform here
-    // TODO(bunnei): Sync scissorbox uniform(s) here
+    SyncAlphaTest();
 
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
 
     std::size_t buffer_size = CalculateVertexArraysSize();
 
-    // Add space for index buffer (keeping in mind non-core primitives)
-    switch (regs.draw.topology) {
-    case Maxwell::PrimitiveTopology::Quads:
-        buffer_size = Common::AlignUp(buffer_size, 4) +
-                      primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count);
-        break;
-    default:
-        if (is_indexed) {
-            buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
-        }
-        break;
+    // Add space for index buffer
+    if (is_indexed) {
+        buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
     }
 
     // Uniform space for the 5 shader stages
@@ -810,57 +776,55 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                         const Shader& shader, GLuint program_handle,
-                                         BaseBindings base_bindings) {
+void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                             const Shader& shader) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& gpu = system.GPU();
-    const auto& maxwell3d = gpu.Maxwell3D();
-    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
+    const auto stage_index = static_cast<std::size_t>(stage);
+    const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
     const auto& entries = shader->GetShaderEntries().const_buffers;
 
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
-        const auto& used_buffer = entries[bindpoint];
-        const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
-
-        if (!buffer.enabled) {
-            // Set values to zero to unbind buffers
-            bind_ubo_pushbuffer.Push(0, 0, 0);
-            continue;
-        }
+        const auto& entry = entries[bindpoint]; // one UBO push per entry, enabled or not
+        SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
+    }
+}
 
-        std::size_t size = 0;
+void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const GLShader::ConstBufferEntry& entry) {
+    if (!buffer.enabled) {
+        // Set values to zero to unbind buffers
+        bind_ubo_pushbuffer.Push(0, 0, 0);
+        return; // nothing is uploaded for a disabled buffer
+    }
 
-        if (used_buffer.IsIndirect()) {
-            // Buffer is accessed indirectly, so upload the entire thing
-            size = buffer.size;
+    std::size_t size; // assigned on both branches below
+    if (entry.IsIndirect()) {
+        // Buffer is accessed indirectly, so upload the entire thing
+        size = buffer.size;
 
-            if (size > MaxConstbufferSize) {
-                LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
-                            MaxConstbufferSize);
-                size = MaxConstbufferSize;
-            }
-        } else {
-            // Buffer is accessed directly, upload just what we use
-            size = used_buffer.GetSize();
+        if (size > MaxConstbufferSize) {
+            LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
+                        MaxConstbufferSize);
+            size = MaxConstbufferSize; // clamp instead of failing the assert below
         }
+    } else {
+        // Buffer is accessed directly, upload just what we use
+        size = entry.GetSize();
+    }
 
-        // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
-        // UBO alignment requirements.
-        size = Common::AlignUp(size, sizeof(GLvec4));
-        ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
-
-        const GLintptr const_buffer_offset =
-            buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
+    // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
+    // UBO alignment requirements.
+    size = Common::AlignUp(size, sizeof(GLvec4));
+    ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
 
-        bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
-    }
+    const std::size_t alignment = device.GetUniformBufferAlignment();
+    const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size); // queue the uploaded range
 }
 
 void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader, GLenum primitive_mode,
-                                          BaseBindings base_bindings) {
+                                          const Shader& shader) {
     const auto& entries = shader->GetShaderEntries().global_memory_entries;
     for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& entry{entries[bindpoint]};
@@ -874,7 +838,7 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
 }
 
 void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
-                                     GLuint program_handle, BaseBindings base_bindings) {
+                                     BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& gpu = system.GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
@@ -1152,10 +1116,17 @@ void RasterizerOpenGL::SyncPolygonOffset() {
     state.polygon_offset.clamp = regs.polygon_offset_clamp;
 }
 
-void RasterizerOpenGL::CheckAlphaTests() {
+void RasterizerOpenGL::SyncAlphaTest() { // copies the guest alpha test registers into state
     const auto& regs = system.GPU().Maxwell3D().regs;
     UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1,
                          "Alpha Testing is enabled with more than one rendertarget");
+
+    state.alpha_test.enabled = regs.alpha_test_enabled;
+    if (!state.alpha_test.enabled) {
+        return; // func and ref keep their previous values while the test is disabled
+    }
+    state.alpha_test.func = MaxwellToGL::ComparisonOp(regs.alpha_test_func);
+    state.alpha_test.ref = regs.alpha_test_ref;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index d78094138..f7671ff5d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -17,17 +17,18 @@
 #include <glad/glad.h>
 
 #include "common/common_types.h"
+#include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
-#include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/utils.h"
@@ -106,17 +107,20 @@ private:
         bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                           GLuint program_handle, BaseBindings base_bindings);
+    void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                               const Shader& shader);
+
+    /// Uploads one constant buffer and queues its UBO binding (a null binding when disabled).
+    void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                          const GLShader::ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader, GLenum primitive_mode,
-                            BaseBindings base_bindings);
+                            const Shader& shader);
 
     /// Configures the current textures to use for the draw command.
     void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                       GLuint program_handle, BaseBindings base_bindings);
+                       BaseBindings base_bindings);
 
     /// Syncs the viewport and depth range to match the guest state
     void SyncViewport(OpenGLState& current_state);
@@ -167,8 +171,8 @@ private:
     /// Syncs the polygon offsets
     void SyncPolygonOffset();
 
-    /// Check asserts for alpha testing.
-    void CheckAlphaTests();
+    /// Syncs the alpha test state to match the guest state
+    void SyncAlphaTest();
 
     /// Check for extension that are not strictly required
     /// but are needed for correct emulation
@@ -197,7 +201,6 @@ private:
 
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
-    PrimitiveAssembler primitive_assembler{buffer_cache};
 
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index d66252224..ac8a9e6b7 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -35,8 +35,8 @@ struct UnspecializedShader {
 namespace {
 
 /// Gets the address for the specified shader stage program
-GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
-    const auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()};
+GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) {
+    const auto& gpu{system.GPU().Maxwell3D()}; // caller-supplied system, not the global instance
     const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
     return gpu.regs.code_address.CodeAddress() + shader_config.offset;
 }
@@ -350,7 +350,8 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, emu_window{emu_window}, device{device}, disk_cache{system} {}
+    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
+      disk_cache{system} {} // system is also kept as a member for GetStageProgram's GPU lookups
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -546,42 +547,45 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
 }
 
 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
-    if (!Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.shaders) {
-        return last_shaders[static_cast<u32>(program)];
+    if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
+        return last_shaders[static_cast<std::size_t>(program)]; // not dirty: reuse last result
     }
 
-    auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
-    const GPUVAddr program_addr{GetShaderAddress(program)};
+    auto& memory_manager{system.GPU().MemoryManager()};
+    const GPUVAddr program_addr{GetShaderAddress(system, program)};
 
     // Look up shader in the cache based on address
-    const auto& host_ptr{memory_manager.GetPointer(program_addr)};
+    const auto host_ptr{memory_manager.GetPointer(program_addr)};
     Shader shader{TryGet(host_ptr)};
+    if (shader) {
+        return last_shaders[static_cast<std::size_t>(program)] = shader; // cache hit
+    }
 
-    if (!shader) {
-        // No shader found - create a new one
-        ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
-        ProgramCode program_code_b;
-        if (program == Maxwell::ShaderProgram::VertexA) {
-            const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
-            program_code_b = GetShaderCode(memory_manager, program_addr_b,
-                                           memory_manager.GetPointer(program_addr_b));
-        }
-        const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
-        const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
-        const auto found = precompiled_shaders.find(unique_identifier);
-        if (found != precompiled_shaders.end()) {
-            shader =
-                std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache,
-                                               precompiled_programs, found->second, host_ptr);
-        } else {
-            shader = std::make_shared<CachedShader>(
-                device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
-                std::move(program_code), std::move(program_code_b), host_ptr);
-        }
-        Register(shader);
+    // No shader found - create a new one
+    ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
+    ProgramCode program_code_b;
+    if (program == Maxwell::ShaderProgram::VertexA) {
+        const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
+        program_code_b = GetShaderCode(memory_manager, program_addr_b,
+                                       memory_manager.GetPointer(program_addr_b));
+    }
+
+    const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
+    const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
+    const auto found = precompiled_shaders.find(unique_identifier);
+    if (found != precompiled_shaders.end()) {
+        // Create a shader from the matching precompiled_shaders entry
+        shader = std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache,
+                                                precompiled_programs, found->second, host_ptr);
+    } else {
+        // Create a shader from guest memory
+        shader = std::make_shared<CachedShader>(
+            device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
+            std::move(program_code), std::move(program_code_b), host_ptr);
     }
+    Register(shader); // only shaders created above are registered; cache hits returned earlier
 
-    return last_shaders[static_cast<u32>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = shader;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 64e5a5594..09bd0761d 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -137,6 +137,7 @@ private:
     CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
                                              const std::set<GLenum>& supported_formats);
 
+    Core::System& system;
     Core::Frontend::EmuWindow& emu_window;
     const Device& device;
     ShaderDiskCacheOpenGL disk_cache;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index e9f8d40db..7dc2e0560 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -45,7 +45,6 @@ struct TextureAoffi {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureAoffi, TextureArgument>;
 
-enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
 
@@ -124,8 +123,8 @@ bool IsPrecise(Operation operand) {
     return false;
 }
 
-bool IsPrecise(Node node) {
-    if (const auto operation = std::get_if<OperationNode>(node)) {
+bool IsPrecise(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
         return IsPrecise(*operation);
     }
     return false;
@@ -144,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
     return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
 }
 
+constexpr const char* GetFlowStackPrefix(MetaStackClass stack) { // GLSL name prefix per stack
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "ssy";
+    case MetaStackClass::Pbk:
+        return "pbk";
+    }
+    return ""; // unknown class: callers feed this to fmt::format, which rejects null C strings
+}
+
+std::string FlowStackName(MetaStackClass stack) { // e.g. "ssy_flow_stack"
+    return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack));
+}
+
+std::string FlowStackTopName(MetaStackClass stack) { // e.g. "pbk_flow_stack_top"
+    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
+}
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@@ -174,8 +191,10 @@ public:
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
         constexpr u32 FLOW_STACK_SIZE = 20;
-        code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
-        code.AddLine("uint flow_stack_top = 0u;");
+        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        }
 
         code.AddLine("while (true) {{");
         ++code.scope;
@@ -247,6 +266,12 @@ private:
         code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices);
         code.AddNewLine();
 
+        code.AddLine("in gl_PerVertex {{");
+        ++code.scope;
+        code.AddLine("vec4 gl_Position;");
+        --code.scope;
+        code.AddLine("}} gl_in[];");
+
         DeclareVertexRedeclarations();
     }
 
@@ -349,7 +374,7 @@ private:
     }
 
     void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
-        const u32 generic_index{GetGenericAttributeIndex(index)};
+        const u32 location{GetGenericAttributeIndex(index)};
 
         std::string name{GetInputAttribute(index)};
         if (stage == ShaderStage::Geometry) {
@@ -358,19 +383,13 @@ private:
 
         std::string suffix;
         if (stage == ShaderStage::Fragment) {
-            const auto input_mode{header.ps.GetAttributeUse(generic_index)};
+            const auto input_mode{header.ps.GetAttributeUse(location)};
             if (skip_unused && input_mode == AttributeUse::Unused) {
                 return;
             }
             suffix = GetInputFlags(input_mode);
         }
 
-        u32 location = generic_index;
-        if (stage != ShaderStage::Vertex) {
-            // If inputs are varyings, add an offset
-            location += GENERIC_VARYING_START_LOCATION;
-        }
-
         code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name);
     }
 
@@ -395,7 +414,7 @@ private:
     }
 
     void DeclareOutputAttribute(Attribute::Index index) {
-        const u32 location{GetGenericAttributeIndex(index) + GENERIC_VARYING_START_LOCATION};
+        const u32 location{GetGenericAttributeIndex(index)}; // generic attribute N -> location N
         code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
     }
 
@@ -498,15 +517,15 @@ private:
     }
 
     void VisitBlock(const NodeBlock& bb) {
-        for (const Node node : bb) {
+        for (const auto& node : bb) {
             if (const std::string expr = Visit(node); !expr.empty()) {
                 code.AddLine(expr);
             }
         }
     }
 
-    std::string Visit(Node node) {
-        if (const auto operation = std::get_if<OperationNode>(node)) {
+    std::string Visit(const Node& node) {
+        if (const auto operation = std::get_if<OperationNode>(&*node)) {
             const auto operation_index = static_cast<std::size_t>(operation->GetCode());
             if (operation_index >= operation_decompilers.size()) {
                 UNREACHABLE_MSG("Out of bounds operation: {}", operation_index);
@@ -520,7 +539,7 @@ private:
             return (this->*decompiler)(*operation);
         }
 
-        if (const auto gpr = std::get_if<GprNode>(node)) {
+        if (const auto gpr = std::get_if<GprNode>(&*node)) {
             const u32 index = gpr->GetIndex();
             if (index == Register::ZeroIndex) {
                 return "0";
@@ -528,7 +547,7 @@ private:
             return GetRegister(index);
         }
 
-        if (const auto immediate = std::get_if<ImmediateNode>(node)) {
+        if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
             const u32 value = immediate->GetValue();
             if (value < 10) {
                 // For eyecandy avoid using hex numbers on single digits
@@ -537,7 +556,7 @@ private:
             return fmt::format("utof(0x{:x}u)", immediate->GetValue());
         }
 
-        if (const auto predicate = std::get_if<PredicateNode>(node)) {
+        if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
             const auto value = [&]() -> std::string {
                 switch (const auto index = predicate->GetIndex(); index) {
                 case Tegra::Shader::Pred::UnusedIndex:
@@ -554,7 +573,7 @@ private:
             return value;
         }
 
-        if (const auto abuf = std::get_if<AbufNode>(node)) {
+        if (const auto abuf = std::get_if<AbufNode>(&*node)) {
             UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
                                  "Physical attributes in geometry shaders are not implemented");
             if (abuf->IsPhysicalBuffer()) {
@@ -564,9 +583,9 @@ private:
             return ReadAttribute(abuf->GetIndex(), abuf->GetElement(), abuf->GetBuffer());
         }
 
-        if (const auto cbuf = std::get_if<CbufNode>(node)) {
+        if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
-            if (const auto immediate = std::get_if<ImmediateNode>(offset)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
@@ -577,30 +596,47 @@ private:
             if (std::holds_alternative<OperationNode>(*offset)) {
                 // Indirect access
                 const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = (ftou({}) / 4);", final_offset, Visit(offset));
-                return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()),
-                                   final_offset, final_offset);
+                code.AddLine("uint {} = ftou({}) >> 2;", final_offset, Visit(offset));
+
+                if (!device.HasComponentIndexingBug()) {
+                    return fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                       final_offset, final_offset);
+                }
+
+                // AMD's proprietary GLSL compiler miscompiles variable component access.
+                // To work around this driver bug, generate four ifs, one per component.
+                const std::string pack = code.GenerateTemporary();
+                code.AddLine("vec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                             final_offset);
+
+                const std::string result = code.GenerateTemporary();
+                code.AddLine("float {};", result);
+                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
+                                 pack, GetSwizzle(swizzle));
+                }
+                return result;
             }
 
             UNREACHABLE_MSG("Unmanaged offset node type");
         }
 
-        if (const auto gmem = std::get_if<GmemNode>(node)) {
+        if (const auto gmem = std::get_if<GmemNode>(&*node)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
             const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
             return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
         }
 
-        if (const auto lmem = std::get_if<LmemNode>(node)) {
+        if (const auto lmem = std::get_if<LmemNode>(&*node)) {
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
         }
 
-        if (const auto internal_flag = std::get_if<InternalFlagNode>(node)) {
+        if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
             return GetInternalFlag(internal_flag->GetFlag());
         }
 
-        if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+        if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             // It's invalid to call conditional on nested nodes, use an operation instead
             code.AddLine("if ({}) {{", Visit(conditional->GetCondition()));
             ++code.scope;
@@ -612,7 +648,7 @@ private:
             return {};
         }
 
-        if (const auto comment = std::get_if<CommentNode>(node)) {
+        if (const auto comment = std::get_if<CommentNode>(&*node)) {
             return "// " + comment->GetText();
         }
 
@@ -620,7 +656,7 @@ private:
         return {};
     }
 
-    std::string ReadAttribute(Attribute::Index attribute, u32 element, Node buffer = {}) {
+    std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
         const auto GeometryPass = [&](std::string_view name) {
             if (stage == ShaderStage::Geometry && buffer) {
                 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
@@ -633,10 +669,14 @@ private:
 
         switch (attribute) {
         case Attribute::Index::Position:
-            if (stage != ShaderStage::Fragment) {
-                return GeometryPass("position") + GetSwizzle(element);
-            } else {
+            switch (stage) {
+            case ShaderStage::Geometry:
+                return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
+                                   GetSwizzle(element));
+            case ShaderStage::Fragment:
                 return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
+            default:
+                UNREACHABLE();
             }
         case Attribute::Index::PointCoord:
             switch (element) {
@@ -852,7 +892,7 @@ private:
         std::string expr = ", ";
         switch (type) {
         case Type::Int:
-            if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*operand)) {
                 // Inline the string as an immediate integer in GLSL (some extra arguments are
                 // required to be constant)
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
@@ -884,7 +924,7 @@ private:
 
         for (std::size_t index = 0; index < aoffi.size(); ++index) {
             const auto operand{aoffi.at(index)};
-            if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*operand)) {
                 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
                 // to be constant by the standard).
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
@@ -905,23 +945,23 @@ private:
     }
 
     std::string Assign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         std::string target;
-        if (const auto gpr = std::get_if<GprNode>(dest)) {
+        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
                 // Writing to Register::ZeroIndex is a no op
                 return {};
             }
             target = GetRegister(gpr->GetIndex());
-        } else if (const auto abuf = std::get_if<AbufNode>(dest)) {
+        } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
 
             target = [&]() -> std::string {
                 switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) {
                 case Attribute::Index::Position:
-                    return "position"s + GetSwizzle(abuf->GetElement());
+                    return "gl_Position"s + GetSwizzle(abuf->GetElement());
                 case Attribute::Index::PointSize:
                     return "gl_PointSize";
                 case Attribute::Index::ClipDistances0123:
@@ -937,9 +977,9 @@ private:
                     return "0";
                 }
             }();
-        } else if (const auto lmem = std::get_if<LmemNode>(dest)) {
+        } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
-        } else if (const auto gmem = std::get_if<GmemNode>(dest)) {
+        } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
             const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
@@ -1216,12 +1256,12 @@ private:
     }
 
     std::string LogicalAssign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         std::string target;
 
-        if (const auto pred = std::get_if<PredicateNode>(dest)) {
+        if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
             ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
 
             const auto index = pred->GetIndex();
@@ -1232,7 +1272,7 @@ private:
                 return {};
             }
             target = GetPredicate(index);
-        } else if (const auto flag = std::get_if<InternalFlagNode>(dest)) {
+        } else if (const auto flag = std::get_if<InternalFlagNode>(&*dest)) {
             target = GetInternalFlag(flag->GetFlag());
         }
 
@@ -1409,7 +1449,7 @@ private:
     }
 
     std::string Branch(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
         code.AddLine("jmp_to = 0x{:x}u;", target->GetValue());
@@ -1418,15 +1458,18 @@ private:
     }
 
     std::string PushFlowStack(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
+        code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
+                     target->GetValue());
         return {};
     }
 
     std::string PopFlowStack(Operation operation) {
-        code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+        code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
         code.AddLine("break;");
         return {};
     }
@@ -1447,27 +1490,9 @@ private:
 
         UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented");
 
-        code.AddLine("if (alpha_test[0] != 0) {{");
-        ++code.scope;
-        // We start on the register containing the alpha value in the first RT.
-        u32 current_reg = 3;
-        for (u32 render_target = 0; render_target < Maxwell::NumRenderTargets; ++render_target) {
-            // TODO(Blinkhawk): verify the behavior of alpha testing on hardware when
-            // multiple render targets are used.
-            if (header.ps.IsColorComponentOutputEnabled(render_target, 0) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 1) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 2) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 3)) {
-                code.AddLine("if (!AlphaFunc({})) discard;", SafeGetRegister(current_reg));
-                current_reg += 4;
-            }
-        }
-        --code.scope;
-        code.AddLine("}}");
-
         // Write the color outputs using the data in the shader registers, disabled
         // rendertargets/components are skipped in the register assignment.
-        current_reg = 0;
+        u32 current_reg = 0;
         for (u32 render_target = 0; render_target < Maxwell::NumRenderTargets; ++render_target) {
             // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
             for (u32 component = 0; component < 4; ++component) {
@@ -1506,9 +1531,7 @@ private:
 
         // If a geometry shader is attached, it will always flip (it's the last stage before
         // fragment). For more info about flipping, refer to gl_shader_gen.cpp.
-        code.AddLine("position.xy *= viewport_flip.xy;");
-        code.AddLine("gl_Position = position;");
-        code.AddLine("position.w = 1.0;");
+        code.AddLine("gl_Position.xy *= viewport_flip.xy;");
         code.AddLine("EmitVertex();");
         return {};
     }
@@ -1746,8 +1769,7 @@ private:
     }
 
     u32 GetNumPhysicalVaryings() const {
-        return std::min<u32>(device.GetMaxVaryings() - GENERIC_VARYING_START_LOCATION,
-                             Maxwell::NumVaryings);
+        return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
     }
 
     const Device& device;
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index d2bb705a9..9148629ec 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -23,12 +23,9 @@ ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setu
     out += GetCommonDeclarations();
 
     out += R"(
-layout (location = 0) out vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
 )";
@@ -48,7 +45,6 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 
     out += R"(
 void main() {
-    position = vec4(0.0, 0.0, 0.0, 0.0);
     execute_vertex();
 )";
 
@@ -59,19 +55,12 @@ void main() {
     out += R"(
 
     // Set Position Y direction
-    position.y *= utof(config_pack[2]);
+    gl_Position.y *= utof(config_pack[2]);
     // Check if the flip stage is VertexB
     // Config pack's second value is flip_stage
     if (config_pack[1] == 1) {
         // Viewport can be flipped, which is unsupported by glViewport
-        position.xy *= viewport_flip.xy;
-    }
-    gl_Position = position;
-
-    // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
-    // For now, this is here to bring order in lieu of proper emulation
-    if (config_pack[1] == 1) {
-        position.w = 1.0;
+        gl_Position.xy *= viewport_flip.xy;
     }
 })";
 
@@ -85,13 +74,9 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
     out += GetCommonDeclarations();
 
     out += R"(
-layout (location = 0) in vec4 gs_position[];
-layout (location = 0) out vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
 )";
@@ -124,38 +109,11 @@ layout (location = 5) out vec4 FragColor5;
 layout (location = 6) out vec4 FragColor6;
 layout (location = 7) out vec4 FragColor7;
 
-layout (location = 0) in noperspective vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
-bool AlphaFunc(in float value) {
-    float ref = uintBitsToFloat(alpha_test[2]);
-    switch (alpha_test[1]) {
-        case 1:
-            return false;
-        case 2:
-            return value < ref;
-        case 3:
-            return value == ref;
-        case 4:
-            return value <= ref;
-        case 5:
-            return value > ref;
-        case 6:
-            return value != ref;
-        case 7:
-            return value >= ref;
-        case 8:
-            return true;
-        default:
-            return false;
-    }
-}
-
 )";
     const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 05ab01dcb..b05f90f20 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -48,17 +48,6 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shade
     viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
     viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
 
-    auto func{static_cast<u32>(regs.alpha_test_func)};
-    // Normalize the gl variants of opCompare to be the same as the normal variants
-    const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never);
-    if (func >= op_gl_variant_base) {
-        func = func - op_gl_variant_base + 1U;
-    }
-
-    alpha_test.enabled = regs.alpha_test_enabled;
-    alpha_test.func = func;
-    alpha_test.ref = regs.alpha_test_ref;
-
     instance_id = state.current_instance;
 
     // Assign in which stage the position has to be flipped
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index cec18a832..6961e702a 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -27,14 +27,8 @@ struct MaxwellUniformData {
         GLuint flip_stage;
         GLfloat y_direction;
     };
-    struct alignas(16) {
-        GLuint enabled;
-        GLuint func;
-        GLfloat ref;
-        GLuint padding;
-    } alpha_test;
 };
-static_assert(sizeof(MaxwellUniformData) == 48, "MaxwellUniformData structure size is incorrect");
+static_assert(sizeof(MaxwellUniformData) == 32, "MaxwellUniformData structure size is incorrect");
 static_assert(sizeof(MaxwellUniformData) < 16384,
               "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec");
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 7425fbe5d..d86e137ac 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -156,6 +156,10 @@ OpenGLState::OpenGLState() {
     polygon_offset.factor = 0.0f;
     polygon_offset.units = 0.0f;
     polygon_offset.clamp = 0.0f;
+
+    alpha_test.enabled = false;
+    alpha_test.func = GL_ALWAYS;
+    alpha_test.ref = 0.0f;
 }
 
 void OpenGLState::ApplyDefaultState() {
@@ -461,6 +465,14 @@ void OpenGLState::ApplyPolygonOffset() const {
     }
 }
 
+void OpenGLState::ApplyAlphaTest() const {
+    Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled);
+    if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref),
+                  std::tie(alpha_test.func, alpha_test.ref))) {
+        glAlphaFunc(alpha_test.func, alpha_test.ref);
+    }
+}
+
 void OpenGLState::ApplyTextures() const {
     bool has_delta{};
     std::size_t first{};
@@ -533,6 +545,7 @@ void OpenGLState::Apply() const {
     ApplyTextures();
     ApplySamplers();
     ApplyPolygonOffset();
+    ApplyAlphaTest();
 }
 
 void OpenGLState::EmulateViewportWithScissor() {
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 41418a7b8..b0140495d 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -172,6 +172,12 @@ public:
         GLfloat clamp;
     } polygon_offset;
 
+    struct {
+        bool enabled; // GL_ALPHA_TEST
+        GLenum func;  // GL_ALPHA_TEST_FUNC
+        GLfloat ref;  // GL_ALPHA_TEST_REF
+    } alpha_test;
+
     std::array<bool, 8> clip_distance; // GL_CLIP_DISTANCE
 
     OpenGLState();
@@ -215,6 +221,7 @@ public:
     void ApplySamplers() const;
     void ApplyDepthClamp() const;
     void ApplyPolygonOffset() const;
+    void ApplyAlphaTest() const;
 
     /// Set the initial OpenGL state
     static void ApplyDefaultState();
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ed7b5cff0..ea77dd211 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -128,6 +128,8 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
         return GL_TRIANGLE_STRIP;
     case Maxwell::PrimitiveTopology::TriangleFan:
         return GL_TRIANGLE_FAN;
+    case Maxwell::PrimitiveTopology::Quads:
+        return GL_QUADS;
     default:
         LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
         UNREACHABLE();
@@ -173,11 +175,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         return GL_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::Border:
         return GL_CLAMP_TO_BORDER;
-    case Tegra::Texture::WrapMode::ClampOGL:
-        // TODO(Subv): GL_CLAMP was removed as of OpenGL 3.1, to implement GL_CLAMP, we can use
-        // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
-        // manually mix them. However the shader part of this is not yet implemented.
-        return GL_CLAMP_TO_BORDER;
+    case Tegra::Texture::WrapMode::Clamp:
+        return GL_CLAMP;
     case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
         return GL_MIRROR_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3451d321d..aafd6f31b 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -18,7 +18,6 @@
 #include "core/perf_stats.h"
 #include "core/settings.h"
 #include "core/telemetry_session.h"
-#include "core/tracer/recorder.h"
 #include "video_core/morton.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"