aboutsummaryrefslogtreecommitdiff
path: root/src/video_core/renderer_opengl
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core/renderer_opengl')
-rw-r--r--src/video_core/renderer_opengl/gl_arb_decompiler.cpp2126
-rw-r--r--src/video_core/renderer_opengl/gl_arb_decompiler.h29
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp87
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.h59
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp128
-rw-r--r--src/video_core/renderer_opengl/gl_device.h42
-rw-r--r--src/video_core/renderer_opengl/gl_fence_manager.cpp73
-rw-r--r--src/video_core/renderer_opengl/gl_fence_manager.h52
-rw-r--r--src/video_core/renderer_opengl/gl_query_cache.cpp15
-rw-r--r--src/video_core/renderer_opengl/gl_query_cache.h14
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp864
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h98
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.cpp18
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.h25
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp459
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.h109
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp436
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.h39
-rw-r--r--src/video_core/renderer_opengl/gl_shader_disk_cache.cpp130
-rw-r--r--src/video_core/renderer_opengl/gl_shader_disk_cache.h27
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.cpp123
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.h59
-rw-r--r--src/video_core/renderer_opengl/gl_shader_util.cpp15
-rw-r--r--src/video_core/renderer_opengl/gl_shader_util.h2
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.cpp6
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.h28
-rw-r--r--src/video_core/renderer_opengl/gl_stream_buffer.cpp64
-rw-r--r--src/video_core/renderer_opengl/gl_stream_buffer.h25
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp296
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h40
-rw-r--r--src/video_core/renderer_opengl/maxwell_to_gl.h127
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp397
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.h50
-rw-r--r--src/video_core/renderer_opengl/utils.cpp62
-rw-r--r--src/video_core/renderer_opengl/utils.h43
35 files changed, 4449 insertions, 1718 deletions
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..d6120c23e
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2126 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <map>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&; // Parameter type shared by the operation handlers in this file
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; // Text names of four internal flags; their use is outside this excerpt
+
+// Maps a component index in [0, 3] to its swizzle letter ('x', 'y', 'z' or 'w').
+// Larger indices make std::string_view::at throw std::out_of_range.
+char Swizzle(std::size_t component) {
+    constexpr std::string_view component_letters = "xyzw";
+    const char letter = component_letters.at(component);
+    return letter;
+}
+
+// Tells whether `index` lies in the Attribute_0..Attribute_31 range, both ends included.
+constexpr bool IsGenericAttribute(Attribute::Index index) {
+    const bool below_first = index < Attribute::Index::Attribute_0;
+    const bool above_last = index > Attribute::Index::Attribute_31;
+    return !below_first && !above_last;
+}
+
+// Converts a generic attribute enumerator into its zero-based slot (Attribute_0 maps to 0).
+// Asserts that `index` is a generic attribute.
+u32 GetGenericAttributeIndex(Attribute::Index index) {
+    ASSERT(IsGenericAttribute(index));
+    const auto first_generic = static_cast<u32>(Attribute::Index::Attribute_0);
+    const auto raw_index = static_cast<u32>(index);
+    return raw_index - first_generic;
+}
+
+// Returns the instruction suffix requested by the operation's metadata: ".PREC" when the
+// operation carries MetaArithmetic with `precise` set, an empty suffix otherwise.
+std::string_view Modifiers(Operation operation) {
+    const auto* const arithmetic = std::get_if<MetaArithmetic>(&operation.GetMeta());
+    const bool is_precise = arithmetic != nullptr && arithmetic->precise;
+    return is_precise ? std::string_view{".PREC"} : std::string_view{""};
+}
+
+// Returns the declaration prefix (with its trailing space) for a pixel input mode.
+// Perspective needs no prefix. Unused, or any value not listed here, is reported through
+// UNIMPLEMENTED_MSG and yields an empty view.
+std::string_view GetInputFlags(PixelImap attribute) {
+    if (attribute == PixelImap::Perspective) {
+        return "";
+    }
+    if (attribute == PixelImap::Constant) {
+        return "FLAT ";
+    }
+    if (attribute == PixelImap::ScreenLinear) {
+        return "NOPERSPECTIVE ";
+    }
+    // PixelImap::Unused and unexpected values end up here.
+    UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+    return {};
+}
+
+// Returns the image target keyword used in emitted image instructions for `image_type`.
+// Values outside the listed enumerators hit UNREACHABLE and yield an empty view.
+std::string_view ImageType(Tegra::Shader::ImageType image_type) {
+    using Kind = Tegra::Shader::ImageType;
+    std::string_view target;
+    switch (image_type) {
+    case Kind::Texture1D:
+        target = "1D";
+        return target;
+    case Kind::TextureBuffer:
+        target = "BUFFER";
+        return target;
+    case Kind::Texture1DArray:
+        target = "ARRAY1D";
+        return target;
+    case Kind::Texture2D:
+        target = "2D";
+        return target;
+    case Kind::Texture2DArray:
+        target = "ARRAY2D";
+        return target;
+    case Kind::Texture3D:
+        target = "3D";
+        return target;
+    }
+    UNREACHABLE();
+    return target;
+}
+
+// Returns the text name of a flow stack class ("SSY" or "PBK").
+// Any other value hits UNREACHABLE and yields an empty string.
+std::string_view StackName(MetaStackClass stack) {
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "SSY";
+    case MetaStackClass::Pbk:
+        return "PBK";
+    }
+    UNREACHABLE();
+    return "";
+}
+
+// Returns the keyword emitted after PRIMITIVE_IN for a guest primitive topology.
+// Strip and fan variants map to the keyword of their base primitive. Topologies not listed
+// are reported through UNIMPLEMENTED_MSG and fall back to "POINTS".
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+    using Topology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
+    switch (topology) {
+    case Topology::Points:
+        return "POINTS";
+    case Topology::Lines:
+    case Topology::LineStrip:
+        return "LINES";
+    case Topology::LinesAdjacency:
+    case Topology::LineStripAdjacency:
+        return "LINES_ADJACENCY";
+    case Topology::Triangles:
+    case Topology::TriangleStrip:
+    case Topology::TriangleFan:
+        return "TRIANGLES";
+    case Topology::TrianglesAdjacency:
+    case Topology::TriangleStripAdjacency:
+        return "TRIANGLES_ADJACENCY";
+    default:
+        break;
+    }
+    UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+    return "POINTS";
+}
+
+// Returns the keyword emitted after PRIMITIVE_OUT for a geometry shader output topology.
+// Unknown topologies are reported through UNIMPLEMENTED_MSG and fall back to "POINTS".
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
+    switch (topology) {
+    case Tegra::Shader::OutputTopology::PointList:
+        return "POINTS";
+    case Tegra::Shader::OutputTopology::LineStrip:
+        return "LINE_STRIP";
+    case Tegra::Shader::OutputTopology::TriangleStrip:
+        return "TRIANGLE_STRIP";
+    default:
+        UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+        // Use the same upper-case spelling as the PointList case so the fallback is a keyword
+        // this function already emits, rather than the lower-case "points".
+        return "POINTS";
+    }
+}
+
+// Returns the name prefix used for a stage's built-in inputs: "vertex" for vertex and
+// geometry stages, "fragment" for fragment, "invocation" for compute.
+// Any other stage hits UNREACHABLE and yields an empty string.
+std::string_view StageInputName(ShaderType stage) {
+    const bool reads_vertices = stage == ShaderType::Vertex || stage == ShaderType::Geometry;
+    if (reads_vertices) {
+        return "vertex";
+    }
+    if (stage == ShaderType::Fragment) {
+        return "fragment";
+    }
+    if (stage == ShaderType::Compute) {
+        return "invocation";
+    }
+    UNREACHABLE();
+    return "";
+}
+
+// Builds the texture target name for a sampler: "BUFFER" for buffer samplers, otherwise
+// an optional "SHADOW" prefix, an optional "ARRAY" prefix and the dimension
+// ("1D", "2D", "3D" or "CUBE"). An unexpected sampler type hits UNREACHABLE and is treated as "2D".
+std::string TextureType(const MetaTexture& meta) {
+    const auto& sampler = meta.sampler;
+    if (sampler.is_buffer) {
+        return "BUFFER";
+    }
+
+    std::string name;
+    if (sampler.is_shadow) {
+        name.append("SHADOW");
+    }
+    if (sampler.is_array) {
+        name.append("ARRAY");
+    }
+
+    const char* dimension = nullptr;
+    switch (sampler.type) {
+    case Tegra::Shader::TextureType::Texture1D:
+        dimension = "1D";
+        break;
+    case Tegra::Shader::TextureType::Texture2D:
+        dimension = "2D";
+        break;
+    case Tegra::Shader::TextureType::Texture3D:
+        dimension = "3D";
+        break;
+    case Tegra::Shader::TextureType::TextureCube:
+        dimension = "CUBE";
+        break;
+    }
+    if (dimension == nullptr) {
+        UNREACHABLE();
+        dimension = "2D";
+    }
+    name.append(dimension);
+    return name;
+}
+
+class ARBDecompiler final { // Builds assembly program text from a ShaderIR; the constructor does the work and Code() returns the result
+public:
+ explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+ ShaderType stage, std::string_view identifier);
+
+ std::string Code() const { // Returns a copy of the generated program text
+ return shader_source;
+ }
+
+private:
+ void DefineGlobalMemory(); // Fills global_memory_names
+
+ void DeclareHeader(); // The Declare* helpers append declaration text to shader_source
+ void DeclareVertex();
+ void DeclareGeometry();
+ void DeclareFragment();
+ void DeclareCompute();
+ void DeclareInputAttributes();
+ void DeclareOutputAttributes();
+ void DeclareLocalMemory();
+ void DeclareGlobalMemory();
+ void DeclareConstantBuffers();
+ void DeclareRegisters();
+ void DeclareTemporaries();
+ void DeclarePredicates();
+ void DeclareInternalFlags();
+
+ void InitializeVariables();
+
+ void DecompileAST(); // Used by the constructor when ir.IsDecompiled() is true
+ void DecompileBranchMode(); // Used by the constructor otherwise
+
+ void VisitAST(const ASTNode& node);
+ std::string VisitExpression(const Expr& node);
+
+ void VisitBlock(const NodeBlock& bb);
+
+ std::string Visit(const Node& node);
+
+ std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation);
+ std::string BuildAoffi(Operation);
+ std::string GlobalMemoryPointer(const GmemNode& gmem);
+ void Exit();
+
+ std::string Assign(Operation); // From here on: operation handlers referenced by OPERATION_DECOMPILERS; each takes the node and returns a string
+ std::string Select(Operation);
+ std::string FClamp(Operation);
+ std::string FCastHalf0(Operation);
+ std::string FCastHalf1(Operation);
+ std::string FSqrt(Operation);
+ std::string FSwizzleAdd(Operation);
+ std::string HAdd2(Operation);
+ std::string HMul2(Operation);
+ std::string HFma2(Operation);
+ std::string HAbsolute(Operation);
+ std::string HNegate(Operation);
+ std::string HClamp(Operation);
+ std::string HCastFloat(Operation);
+ std::string HUnpack(Operation);
+ std::string HMergeF32(Operation);
+ std::string HMergeH0(Operation);
+ std::string HMergeH1(Operation);
+ std::string HPack2(Operation);
+ std::string LogicalAssign(Operation);
+ std::string LogicalPick2(Operation);
+ std::string LogicalAnd2(Operation);
+ std::string FloatOrdered(Operation);
+ std::string FloatUnordered(Operation);
+ std::string LogicalAddCarry(Operation);
+ std::string Texture(Operation);
+ std::string TextureGather(Operation);
+ std::string TextureQueryDimensions(Operation);
+ std::string TextureQueryLod(Operation);
+ std::string TexelFetch(Operation);
+ std::string TextureGradient(Operation);
+ std::string ImageLoad(Operation);
+ std::string ImageStore(Operation);
+ std::string Branch(Operation);
+ std::string BranchIndirect(Operation);
+ std::string PushFlowStack(Operation);
+ std::string PopFlowStack(Operation);
+ std::string Exit(Operation);
+ std::string Discard(Operation);
+ std::string EmitVertex(Operation);
+ std::string EndPrimitive(Operation);
+ std::string InvocationId(Operation);
+ std::string YNegate(Operation);
+ std::string ThreadId(Operation);
+ std::string ShuffleIndexed(Operation);
+ std::string Barrier(Operation);
+ std::string MemoryBarrierGroup(Operation);
+ std::string MemoryBarrierGlobal(Operation);
+
+ template <const std::string_view& op>
+ std::string Unary(Operation operation) { // Emits "<op><modifiers> tmp, a;" and returns the fresh scalar temporary tmp
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
+ return temporary;
+ }
+
+ template <const std::string_view& op>
+ std::string Binary(Operation operation) { // Emits "<op><modifiers> tmp, a, b;" and returns the fresh scalar temporary tmp
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+ Visit(operation[1]));
+ return temporary;
+ }
+
+ template <const std::string_view& op>
+ std::string Trinary(Operation operation) { // Emits "<op><modifiers> tmp, a, b, c;" and returns the fresh scalar temporary tmp
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+ Visit(operation[1]), Visit(operation[2]));
+ return temporary;
+ }
+
+ template <const std::string_view& op, bool unordered>
+ std::string FloatComparison(Operation operation) { // Float compare that returns a temporary following the -1 (true) / 0 (false) predicate convention
+ std::string temporary = AllocTemporary();
+ AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); // Binary<op> emits the raw compare; the .CC suffix presumably updates the condition code tested by (NE.x) below - confirm against NV_gpu_program4
+ AddLine("MOV.S {}, 0;", temporary);
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+
+ const std::string op_a = Visit(operation[0]); // NOTE(review): both operands were already visited inside Binary<op>; they are visited a second time here
+ const std::string op_b = Visit(operation[1]);
+ if constexpr (unordered) { // Unordered variants: also set the result to -1 when an operand compares unequal to itself
+ AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+ AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+ } else if (op == SNE_F) { // Ordered not-equal: force the result to 0 when an operand compares unequal to itself
+ AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), 0;", temporary);
+ AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), 0;", temporary);
+ }
+ return temporary;
+ }
+
+ template <const std::string_view& op, bool is_nan>
+ std::string HalfComparison(Operation operation) { // Compares two operands after UP2H.F and returns a vector temporary holding -1/0 in .x and .y
+ std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ const std::string op_a = Visit(operation[0]);
+ const std::string op_b = Visit(operation[1]);
+ AddLine("UP2H.F {}, {};", tmp1, op_a); // Presumably unpacks a packed half pair into the vector temporary - TODO confirm
+ AddLine("UP2H.F {}, {};", tmp2, op_b);
+ AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2);
+ AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+ AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); // {{ and }} are fmt escapes for literal braces
+ AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
+ AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+ if constexpr (is_nan) { // NaN-aware variants: .x is set when op_a tests NAN.x, .y when op_b tests NAN.x
+ AddLine("MOVC.F RC.x, {};", op_a);
+ AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+ AddLine("MOVC.F RC.x, {};", op_b);
+ AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+ }
+ return tmp1;
+ }
+
+ template <const std::string_view& op, const std::string_view& type>
+ std::string AtomicImage(Operation operation) { // Emits ATOMIM.<op>.<type> on image[image_id] and returns the .x lane that receives the result
+ const auto& meta = std::get<MetaImage>(operation.GetMeta());
+ const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; // Stage base image binding plus the image's own index
+ const std::size_t num_coords = operation.GetOperandsCount(); // Every operand of the operation is moved into the coordinate vector
+ const std::size_t num_values = meta.values.size();
+
+ const std::string coord = AllocVectorTemporary();
+ const std::string value = AllocVectorTemporary();
+ for (std::size_t i = 0; i < num_coords; ++i) {
+ AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+ }
+ for (std::size_t i = 0; i < num_values; ++i) {
+ AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+ }
+
+ AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord,
+ image_id, ImageType(meta.image.type)); // The coordinate temporary doubles as the destination (.x)
+ return fmt::format("{}.x", coord);
+ }
+
+ template <const std::string_view& op, const std::string_view& type>
+ std::string Atomic(Operation operation) { // Emits an atomic on global (ATOM) or shared (ATOMS) memory and returns the temporary receiving the result
+ std::string temporary = AllocTemporary();
+ std::string address;
+ std::string_view opname;
+ bool robust = false;
+ if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
+ address = GlobalMemoryPointer(*gmem);
+ opname = "ATOM";
+ robust = true; // Global accesses get wrapped in the IF NE.x guard emitted below
+ } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
+ address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
+ opname = "ATOMS";
+ } else {
+ UNREACHABLE();
+ return "{0, 0, 0, 0}";
+ }
+ if (robust) {
+ AddLine("IF NE.x;"); // Presumably tests a condition code left by GlobalMemoryPointer - TODO confirm
+ }
+ AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
+ if (robust) {
+ AddLine("ELSE;");
+ AddLine("MOV.S {}, 0;", temporary); // Guard failed: the result is 0
+ AddLine("ENDIF;");
+ }
+ return temporary;
+ }
+
+ template <char type>
+ std::string Negate(Operation operation) { // Moves the negated operand into a fresh temporary; type 'F' is spelled MOV.F32
+ std::string temporary = AllocTemporary();
+ if constexpr (type == 'F') {
+ AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0]));
+ } else {
+ AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0]));
+ }
+ return temporary;
+ }
+
+ template <char type>
+ std::string Absolute(Operation operation) { // Moves |operand| into a fresh temporary
+ std::string temporary = AllocTemporary();
+ AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
+ return temporary;
+ }
+
+ template <char type>
+ std::string BitfieldInsert(Operation operation) { // Packs operation[3] and operation[2] into .x/.y, then emits BFI with operation[1] and operation[0]; returns the .x lane
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3]));
+ AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2]));
+ AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
+ Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+ }
+
+ template <char type>
+ std::string BitfieldExtract(Operation operation) { // Packs operation[2] and operation[1] into .x/.y, then emits BFE with operation[0]; returns the .x lane
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2]));
+ AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1]));
+ AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+ }
+
+ template <char swizzle>
+ std::string LocalInvocationId(Operation) { // Returns "invocation.localid.<swizzle>" without emitting code
+ return fmt::format("invocation.localid.{}", swizzle);
+ }
+
+ template <char swizzle>
+ std::string WorkGroupId(Operation) { // Returns "invocation.groupid.<swizzle>" without emitting code
+ return fmt::format("invocation.groupid.{}", swizzle);
+ }
+
+ template <char c1, char c2>
+ std::string ThreadMask(Operation) { // Returns "<stage input name>.thread<c1><c2>mask" without emitting code
+ return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2);
+ }
+
+ template <typename... Args>
+ void AddExpression(std::string_view text, Args&&... args) { // Appends fmt-formatted text to shader_source, without a newline
+ shader_source += fmt::format(text, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ void AddLine(std::string_view text, Args&&... args) { // AddExpression followed by '\n'
+ AddExpression(text, std::forward<Args>(args)...);
+ shader_source += '\n';
+ }
+
+ std::string AllocLongVectorTemporary() { // Returns "L<n>" and advances the counter; max_long_temporaries keeps the largest count reached
+ max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1);
+ return fmt::format("L{}", num_long_temporaries++);
+ }
+
+ std::string AllocLongTemporary() { // The .x lane of a fresh long vector temporary
+ return fmt::format("{}.x", AllocLongVectorTemporary());
+ }
+
+ std::string AllocVectorTemporary() { // Returns "T<n>" and advances the counter; max_temporaries keeps the largest count reached
+ max_temporaries = std::max(max_temporaries, num_temporaries + 1);
+ return fmt::format("T{}", num_temporaries++);
+ }
+
+ std::string AllocTemporary() { // The .x lane of a fresh vector temporary
+ return fmt::format("{}.x", AllocVectorTemporary());
+ }
+
+ void ResetTemporaries() noexcept { // Restarts temporary numbering at 0; the max_* counters are left untouched
+ num_temporaries = 0;
+ num_long_temporaries = 0;
+ }
+
+ const Device& device;
+ const ShaderIR& ir;
+ const Registry& registry;
+ const ShaderType stage;
+
+ std::size_t num_temporaries = 0; // Index of the next "T<n>" temporary to hand out
+ std::size_t max_temporaries = 0; // Largest num_temporaries + 1 seen so far
+
+ std::size_t num_long_temporaries = 0; // Index of the next "L<n>" temporary to hand out
+ std::size_t max_long_temporaries = 0; // Largest num_long_temporaries + 1 seen so far
+
+ std::map<GlobalMemoryBase, u32> global_memory_names; // Global memory base -> binding index assigned by DefineGlobalMemory
+
+ std::string shader_source; // Program text accumulated by AddExpression/AddLine
+
+ static constexpr std::string_view ADD_F32 = "ADD.F32"; // Opcode spellings, passed by reference as template arguments to Unary/Binary/Trinary and the comparison helpers
+ static constexpr std::string_view ADD_S = "ADD.S";
+ static constexpr std::string_view ADD_U = "ADD.U";
+ static constexpr std::string_view MUL_F32 = "MUL.F32";
+ static constexpr std::string_view MUL_S = "MUL.S";
+ static constexpr std::string_view MUL_U = "MUL.U";
+ static constexpr std::string_view DIV_F32 = "DIV.F32";
+ static constexpr std::string_view DIV_S = "DIV.S";
+ static constexpr std::string_view DIV_U = "DIV.U";
+ static constexpr std::string_view MAD_F32 = "MAD.F32";
+ static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+ static constexpr std::string_view COS_F32 = "COS.F32";
+ static constexpr std::string_view SIN_F32 = "SIN.F32";
+ static constexpr std::string_view EX2_F32 = "EX2.F32";
+ static constexpr std::string_view LG2_F32 = "LG2.F32";
+ static constexpr std::string_view SLT_F = "SLT.F32";
+ static constexpr std::string_view SLT_S = "SLT.S";
+ static constexpr std::string_view SLT_U = "SLT.U";
+ static constexpr std::string_view SEQ_F = "SEQ.F32";
+ static constexpr std::string_view SEQ_S = "SEQ.S";
+ static constexpr std::string_view SEQ_U = "SEQ.U";
+ static constexpr std::string_view SLE_F = "SLE.F32";
+ static constexpr std::string_view SLE_S = "SLE.S";
+ static constexpr std::string_view SLE_U = "SLE.U";
+ static constexpr std::string_view SGT_F = "SGT.F32";
+ static constexpr std::string_view SGT_S = "SGT.S";
+ static constexpr std::string_view SGT_U = "SGT.U";
+ static constexpr std::string_view SNE_F = "SNE.F32";
+ static constexpr std::string_view SNE_S = "SNE.S";
+ static constexpr std::string_view SNE_U = "SNE.U";
+ static constexpr std::string_view SGE_F = "SGE.F32";
+ static constexpr std::string_view SGE_S = "SGE.S";
+ static constexpr std::string_view SGE_U = "SGE.U";
+ static constexpr std::string_view AND_S = "AND.S";
+ static constexpr std::string_view AND_U = "AND.U";
+ static constexpr std::string_view TRUNC_F = "TRUNC.F";
+ static constexpr std::string_view TRUNC_S = "TRUNC.S";
+ static constexpr std::string_view TRUNC_U = "TRUNC.U";
+ static constexpr std::string_view SHL_S = "SHL.S";
+ static constexpr std::string_view SHL_U = "SHL.U";
+ static constexpr std::string_view SHR_S = "SHR.S";
+ static constexpr std::string_view SHR_U = "SHR.U";
+ static constexpr std::string_view OR_S = "OR.S";
+ static constexpr std::string_view OR_U = "OR.U";
+ static constexpr std::string_view XOR_S = "XOR.S";
+ static constexpr std::string_view XOR_U = "XOR.U";
+ static constexpr std::string_view NOT_S = "NOT.S";
+ static constexpr std::string_view NOT_U = "NOT.U";
+ static constexpr std::string_view BTC_S = "BTC.S";
+ static constexpr std::string_view BTC_U = "BTC.U";
+ static constexpr std::string_view BTFM_S = "BTFM.S";
+ static constexpr std::string_view BTFM_U = "BTFM.U";
+ static constexpr std::string_view ROUND_F = "ROUND.F";
+ static constexpr std::string_view CEIL_F = "CEIL.F";
+ static constexpr std::string_view FLR_F = "FLR.F";
+ static constexpr std::string_view I2F_S = "I2F.S";
+ static constexpr std::string_view I2F_U = "I2F.U";
+ static constexpr std::string_view MIN_F = "MIN.F";
+ static constexpr std::string_view MIN_S = "MIN.S";
+ static constexpr std::string_view MIN_U = "MIN.U";
+ static constexpr std::string_view MAX_F = "MAX.F";
+ static constexpr std::string_view MAX_S = "MAX.S";
+ static constexpr std::string_view MAX_U = "MAX.U";
+ static constexpr std::string_view MOV_U = "MOV.U";
+ static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+ static constexpr std::string_view TGALL_U = "TGALL.U";
+ static constexpr std::string_view TGANY_U = "TGANY.U";
+ static constexpr std::string_view TGEQ_U = "TGEQ.U";
+ static constexpr std::string_view EXCH = "EXCH"; // Atomic operation names, used as the <op> argument of Atomic/AtomicImage
+ static constexpr std::string_view ADD = "ADD";
+ static constexpr std::string_view MIN = "MIN";
+ static constexpr std::string_view MAX = "MAX";
+ static constexpr std::string_view AND = "AND";
+ static constexpr std::string_view OR = "OR";
+ static constexpr std::string_view XOR = "XOR";
+ static constexpr std::string_view U32 = "U32"; // Atomic operand types, used as the <type> argument of Atomic/AtomicImage
+ static constexpr std::string_view S32 = "S32";
+
+ static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount);
+ using DecompilerType = std::string (ARBDecompiler::*)(Operation); // Pointer to an operation handler
+ static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { // One handler per OperationCode; presumably indexed by the enum value, so entry order must match the enum declaration - TODO confirm against shader_ir
+ &ARBDecompiler::Assign,
+
+ &ARBDecompiler::Select,
+
+ &ARBDecompiler::Binary<ADD_F32>,
+ &ARBDecompiler::Binary<MUL_F32>,
+ &ARBDecompiler::Binary<DIV_F32>,
+ &ARBDecompiler::Trinary<MAD_F32>,
+ &ARBDecompiler::Negate<'F'>,
+ &ARBDecompiler::Absolute<'F'>,
+ &ARBDecompiler::FClamp,
+ &ARBDecompiler::FCastHalf0,
+ &ARBDecompiler::FCastHalf1,
+ &ARBDecompiler::Binary<MIN_F>,
+ &ARBDecompiler::Binary<MAX_F>,
+ &ARBDecompiler::Unary<COS_F32>,
+ &ARBDecompiler::Unary<SIN_F32>,
+ &ARBDecompiler::Unary<EX2_F32>,
+ &ARBDecompiler::Unary<LG2_F32>,
+ &ARBDecompiler::Unary<RSQ_F32>,
+ &ARBDecompiler::FSqrt,
+ &ARBDecompiler::Unary<ROUND_F>,
+ &ARBDecompiler::Unary<FLR_F>,
+ &ARBDecompiler::Unary<CEIL_F>,
+ &ARBDecompiler::Unary<TRUNC_F>,
+ &ARBDecompiler::Unary<I2F_S>,
+ &ARBDecompiler::Unary<I2F_U>,
+ &ARBDecompiler::FSwizzleAdd,
+
+ &ARBDecompiler::Binary<ADD_S>,
+ &ARBDecompiler::Binary<MUL_S>,
+ &ARBDecompiler::Binary<DIV_S>,
+ &ARBDecompiler::Negate<'S'>,
+ &ARBDecompiler::Absolute<'S'>,
+ &ARBDecompiler::Binary<MIN_S>,
+ &ARBDecompiler::Binary<MAX_S>,
+
+ &ARBDecompiler::Unary<TRUNC_S>,
+ &ARBDecompiler::Unary<MOV_U>,
+ &ARBDecompiler::Binary<SHL_S>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<SHR_S>,
+ &ARBDecompiler::Binary<AND_S>,
+ &ARBDecompiler::Binary<OR_S>,
+ &ARBDecompiler::Binary<XOR_S>,
+ &ARBDecompiler::Unary<NOT_S>,
+ &ARBDecompiler::BitfieldInsert<'S'>,
+ &ARBDecompiler::BitfieldExtract<'S'>,
+ &ARBDecompiler::Unary<BTC_S>,
+ &ARBDecompiler::Unary<BTFM_S>,
+
+ &ARBDecompiler::Binary<ADD_U>,
+ &ARBDecompiler::Binary<MUL_U>,
+ &ARBDecompiler::Binary<DIV_U>,
+ &ARBDecompiler::Binary<MIN_U>,
+ &ARBDecompiler::Binary<MAX_U>,
+ &ARBDecompiler::Unary<TRUNC_U>,
+ &ARBDecompiler::Unary<MOV_U>,
+ &ARBDecompiler::Binary<SHL_U>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<AND_U>,
+ &ARBDecompiler::Binary<OR_U>,
+ &ARBDecompiler::Binary<XOR_U>,
+ &ARBDecompiler::Unary<NOT_U>,
+ &ARBDecompiler::BitfieldInsert<'U'>,
+ &ARBDecompiler::BitfieldExtract<'U'>,
+ &ARBDecompiler::Unary<BTC_U>,
+ &ARBDecompiler::Unary<BTFM_U>,
+
+ &ARBDecompiler::HAdd2,
+ &ARBDecompiler::HMul2,
+ &ARBDecompiler::HFma2,
+ &ARBDecompiler::HAbsolute,
+ &ARBDecompiler::HNegate,
+ &ARBDecompiler::HClamp,
+ &ARBDecompiler::HCastFloat,
+ &ARBDecompiler::HUnpack,
+ &ARBDecompiler::HMergeF32,
+ &ARBDecompiler::HMergeH0,
+ &ARBDecompiler::HMergeH1,
+ &ARBDecompiler::HPack2,
+
+ &ARBDecompiler::LogicalAssign,
+ &ARBDecompiler::Binary<AND_U>,
+ &ARBDecompiler::Binary<OR_U>,
+ &ARBDecompiler::Binary<XOR_U>,
+ &ARBDecompiler::Unary<NOT_U>,
+ &ARBDecompiler::LogicalPick2,
+ &ARBDecompiler::LogicalAnd2,
+
+ &ARBDecompiler::FloatComparison<SLT_F, false>,
+ &ARBDecompiler::FloatComparison<SEQ_F, false>,
+ &ARBDecompiler::FloatComparison<SLE_F, false>,
+ &ARBDecompiler::FloatComparison<SGT_F, false>,
+ &ARBDecompiler::FloatComparison<SNE_F, false>,
+ &ARBDecompiler::FloatComparison<SGE_F, false>,
+ &ARBDecompiler::FloatOrdered,
+ &ARBDecompiler::FloatUnordered,
+ &ARBDecompiler::FloatComparison<SLT_F, true>,
+ &ARBDecompiler::FloatComparison<SEQ_F, true>,
+ &ARBDecompiler::FloatComparison<SLE_F, true>,
+ &ARBDecompiler::FloatComparison<SGT_F, true>,
+ &ARBDecompiler::FloatComparison<SNE_F, true>,
+ &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+ &ARBDecompiler::Binary<SLT_S>,
+ &ARBDecompiler::Binary<SEQ_S>,
+ &ARBDecompiler::Binary<SLE_S>,
+ &ARBDecompiler::Binary<SGT_S>,
+ &ARBDecompiler::Binary<SNE_S>,
+ &ARBDecompiler::Binary<SGE_S>,
+
+ &ARBDecompiler::Binary<SLT_U>,
+ &ARBDecompiler::Binary<SEQ_U>,
+ &ARBDecompiler::Binary<SLE_U>,
+ &ARBDecompiler::Binary<SGT_U>,
+ &ARBDecompiler::Binary<SNE_U>,
+ &ARBDecompiler::Binary<SGE_U>,
+
+ &ARBDecompiler::LogicalAddCarry,
+
+ &ARBDecompiler::HalfComparison<SLT_F, false>,
+ &ARBDecompiler::HalfComparison<SEQ_F, false>,
+ &ARBDecompiler::HalfComparison<SLE_F, false>,
+ &ARBDecompiler::HalfComparison<SGT_F, false>,
+ &ARBDecompiler::HalfComparison<SNE_F, false>,
+ &ARBDecompiler::HalfComparison<SGE_F, false>,
+ &ARBDecompiler::HalfComparison<SLT_F, true>,
+ &ARBDecompiler::HalfComparison<SEQ_F, true>,
+ &ARBDecompiler::HalfComparison<SLE_F, true>,
+ &ARBDecompiler::HalfComparison<SGT_F, true>,
+ &ARBDecompiler::HalfComparison<SNE_F, true>,
+ &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+ &ARBDecompiler::Texture,
+ &ARBDecompiler::Texture,
+ &ARBDecompiler::TextureGather,
+ &ARBDecompiler::TextureQueryDimensions,
+ &ARBDecompiler::TextureQueryLod,
+ &ARBDecompiler::TexelFetch,
+ &ARBDecompiler::TextureGradient,
+
+ &ARBDecompiler::ImageLoad,
+ &ARBDecompiler::ImageStore,
+
+ &ARBDecompiler::AtomicImage<ADD, U32>,
+ &ARBDecompiler::AtomicImage<AND, U32>,
+ &ARBDecompiler::AtomicImage<OR, U32>,
+ &ARBDecompiler::AtomicImage<XOR, U32>,
+ &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+ &ARBDecompiler::Atomic<EXCH, U32>,
+ &ARBDecompiler::Atomic<ADD, U32>,
+ &ARBDecompiler::Atomic<MIN, U32>,
+ &ARBDecompiler::Atomic<MAX, U32>,
+ &ARBDecompiler::Atomic<AND, U32>,
+ &ARBDecompiler::Atomic<OR, U32>,
+ &ARBDecompiler::Atomic<XOR, U32>,
+
+ &ARBDecompiler::Atomic<EXCH, S32>,
+ &ARBDecompiler::Atomic<ADD, S32>,
+ &ARBDecompiler::Atomic<MIN, S32>,
+ &ARBDecompiler::Atomic<MAX, S32>,
+ &ARBDecompiler::Atomic<AND, S32>,
+ &ARBDecompiler::Atomic<OR, S32>,
+ &ARBDecompiler::Atomic<XOR, S32>,
+
+ &ARBDecompiler::Atomic<ADD, U32>,
+ &ARBDecompiler::Atomic<MIN, U32>,
+ &ARBDecompiler::Atomic<MAX, U32>,
+ &ARBDecompiler::Atomic<AND, U32>,
+ &ARBDecompiler::Atomic<OR, U32>,
+ &ARBDecompiler::Atomic<XOR, U32>,
+
+ &ARBDecompiler::Atomic<ADD, S32>,
+ &ARBDecompiler::Atomic<MIN, S32>,
+ &ARBDecompiler::Atomic<MAX, S32>,
+ &ARBDecompiler::Atomic<AND, S32>,
+ &ARBDecompiler::Atomic<OR, S32>,
+ &ARBDecompiler::Atomic<XOR, S32>,
+
+ &ARBDecompiler::Branch,
+ &ARBDecompiler::BranchIndirect,
+ &ARBDecompiler::PushFlowStack,
+ &ARBDecompiler::PopFlowStack,
+ &ARBDecompiler::Exit,
+ &ARBDecompiler::Discard,
+
+ &ARBDecompiler::EmitVertex,
+ &ARBDecompiler::EndPrimitive,
+
+ &ARBDecompiler::InvocationId,
+ &ARBDecompiler::YNegate,
+ &ARBDecompiler::LocalInvocationId<'x'>,
+ &ARBDecompiler::LocalInvocationId<'y'>,
+ &ARBDecompiler::LocalInvocationId<'z'>,
+ &ARBDecompiler::WorkGroupId<'x'>,
+ &ARBDecompiler::WorkGroupId<'y'>,
+ &ARBDecompiler::WorkGroupId<'z'>,
+
+ &ARBDecompiler::Unary<TGBALLOT_U>,
+ &ARBDecompiler::Unary<TGALL_U>,
+ &ARBDecompiler::Unary<TGANY_U>,
+ &ARBDecompiler::Unary<TGEQ_U>,
+
+ &ARBDecompiler::ThreadId,
+ &ARBDecompiler::ThreadMask<'e', 'q'>,
+ &ARBDecompiler::ThreadMask<'g', 'e'>,
+ &ARBDecompiler::ThreadMask<'g', 't'>,
+ &ARBDecompiler::ThreadMask<'l', 'e'>,
+ &ARBDecompiler::ThreadMask<'l', 't'>,
+ &ARBDecompiler::ShuffleIndexed,
+
+ &ARBDecompiler::Barrier,
+ &ARBDecompiler::MemoryBarrierGroup,
+ &ARBDecompiler::MemoryBarrierGlobal,
+ };
+};
+
+// Generates the whole program text into shader_source.
+//
+// The program body is emitted first and the declarations afterwards, then the two are
+// joined with the declarations in front. Presumably this order is needed because some
+// declarations depend on counters gathered while emitting the body (for example
+// max_temporaries) - confirm against the Declare* definitions.
+//
+// `identifier` is accepted but not used by this constructor.
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                             ShaderType stage, std::string_view identifier)
+    : device{device}, ir{ir}, registry{registry}, stage{stage} {
+    DefineGlobalMemory();
+
+    AddLine("TEMP RC;");
+    AddLine("TEMP FSWZA[4];");
+    AddLine("TEMP FSWZB[4];");
+    if (ir.IsDecompiled()) {
+        DecompileAST();
+    } else {
+        DecompileBranchMode();
+    }
+    AddLine("END");
+
+    // Set the body aside and restart shader_source as an empty string. std::exchange assigns
+    // the replacement explicitly, so the Declare* calls below never append to a moved-from
+    // string, whose contents the standard leaves unspecified.
+    const std::string code = std::exchange(shader_source, std::string{});
+    DeclareHeader();
+    DeclareVertex();
+    DeclareGeometry();
+    DeclareFragment();
+    DeclareCompute();
+    DeclareInputAttributes();
+    DeclareOutputAttributes();
+    DeclareLocalMemory();
+    DeclareGlobalMemory();
+    DeclareConstantBuffers();
+    DeclareRegisters();
+    DeclareTemporaries();
+    DeclarePredicates();
+    DeclareInternalFlags();
+
+    // Declarations first, body last.
+    shader_source += code;
+}
+
+// Returns the two-letter stage tag placed in the "!!NV<tag>5.0" program header:
+// "vp", "gp", "fp" or "cp". Any other stage hits UNREACHABLE and yields an empty string.
+std::string_view HeaderStageName(ShaderType stage) {
+    if (stage == ShaderType::Vertex) {
+        return "vp";
+    }
+    if (stage == ShaderType::Geometry) {
+        return "gp";
+    }
+    if (stage == ShaderType::Fragment) {
+        return "fp";
+    }
+    if (stage == ShaderType::Compute) {
+        return "cp";
+    }
+    UNREACHABLE();
+    return "";
+}
+
+// Assigns consecutive binding indices, starting at 0, to the global memory bases reported
+// by the IR, in iteration order, and records them in global_memory_names.
+void ARBDecompiler::DefineGlobalMemory() {
+    u32 next_binding = 0;
+    for (const auto& entry : ir.GetGlobalMemory()) {
+        // The counter advances for every entry, whether or not the emplace inserts.
+        global_memory_names.emplace(entry.first, next_binding++);
+    }
+}
+
+// Emits the program header line followed by the OPTION lines. Three options are always
+// emitted; the rest depend on the stage, on IR warp usage and on device capabilities.
+void ARBDecompiler::DeclareHeader() {
+    AddLine("!!NV{}5.0", HeaderStageName(stage));
+    // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+    AddLine("OPTION NV_internal;");
+    AddLine("OPTION NV_gpu_program_fp64;");
+    AddLine("OPTION NV_shader_thread_group;");
+
+    const bool wants_shuffle = ir.UsesWarps() && device.HasWarpIntrinsics();
+    if (wants_shuffle) {
+        AddLine("OPTION NV_shader_thread_shuffle;");
+    }
+
+    switch (stage) {
+    case ShaderType::Vertex:
+        if (device.HasNvViewportArray2()) {
+            AddLine("OPTION NV_viewport_array2;");
+        }
+        break;
+    case ShaderType::Fragment:
+        AddLine("OPTION ARB_draw_buffers;");
+        break;
+    default:
+        break;
+    }
+
+    if (device.HasImageLoadFormatted()) {
+        AddLine("OPTION EXT_shader_image_load_formatted;");
+    }
+}
+
+// Vertex stage only: declares result_clip as an alias of result.clip[0..7].
+void ARBDecompiler::DeclareVertex() {
+    if (stage == ShaderType::Vertex) {
+        // Doubled braces are fmt escapes for literal '{' and '}'.
+        AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+    }
+}
+
+// Geometry stage only: declares the input primitive (from the registry's graphics info),
+// the output primitive and maximum vertex count (from the shader header), and the
+// vertex_position attribute alias.
+void ARBDecompiler::DeclareGeometry() {
+    if (stage == ShaderType::Geometry) {
+        const auto& graphics = registry.GetGraphicsInfo();
+        const auto& program_header = ir.GetHeader();
+        AddLine("PRIMITIVE_IN {};", PrimitiveDescription(graphics.primitive_topology));
+        AddLine("PRIMITIVE_OUT {};", TopologyName(program_header.common3.output_topology));
+        AddLine("VERTICES_OUT {};", program_header.common4.max_output_vertices.Value());
+        AddLine("ATTRIB vertex_position = vertex.position;");
+    }
+}
+
+// Fragment stage only: declares one named output per color target, from index 7 down to 0.
+// Target 0 is bound to the unindexed "result.color".
+void ARBDecompiler::DeclareFragment() {
+    if (stage == ShaderType::Fragment) {
+        AddLine("OUTPUT result_color7 = result.color[7];");
+        AddLine("OUTPUT result_color6 = result.color[6];");
+        AddLine("OUTPUT result_color5 = result.color[5];");
+        AddLine("OUTPUT result_color4 = result.color[4];");
+        AddLine("OUTPUT result_color3 = result.color[3];");
+        AddLine("OUTPUT result_color2 = result.color[2];");
+        AddLine("OUTPUT result_color1 = result.color[1];");
+        AddLine("OUTPUT result_color0 = result.color;");
+    }
+}
+
+// Compute stage only: declares the workgroup size and, when the shader uses shared memory,
+// the shared memory block. The requested size is clamped to the host limit, logging an error
+// when clamping happens.
+void ARBDecompiler::DeclareCompute() {
+    if (stage != ShaderType::Compute) {
+        return;
+    }
+
+    const ComputeInfo& compute = registry.GetComputeInfo();
+    AddLine("GROUP_SIZE {} {} {};", compute.workgroup_size[0], compute.workgroup_size[1],
+            compute.workgroup_size[2]);
+
+    // Nothing else to declare when the shader does not use shared memory.
+    if (compute.shared_memory_size_in_words == 0) {
+        return;
+    }
+
+    const u32 host_limit = device.GetMaxComputeSharedMemorySize();
+    const u32 requested_bytes = compute.shared_memory_size_in_words * 4;
+    const bool exceeds_limit = requested_bytes > host_limit;
+    if (exceeds_limit) {
+        LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}",
+                  requested_bytes, host_limit);
+    }
+    const u32 declared_bytes = exceeds_limit ? host_limit : requested_bytes;
+
+    AddLine("SHARED_MEMORY {};", declared_bytes);
+    AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+}
+
+// Declares an "in_attr{N}" ATTRIB binding for every generic input attribute used by the shader.
+// Nothing is emitted for compute shaders. In fragment shaders the declaration is prefixed with
+// the flags returned by GetInputFlags() for the attribute's pixel imap mode, and attributes
+// whose mode is PixelImap::Unused are not declared.
+void ARBDecompiler::DeclareInputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    const std::string_view stage_name = StageInputName(stage);
+    for (const auto attribute : ir.GetInputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+
+        std::string_view suffix;
+        if (stage == ShaderType::Fragment) {
+            const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+            if (input_mode == PixelImap::Unused) {
+                // Skip only this attribute. Returning here would silently drop the
+                // declarations of every attribute that comes after it in the set.
+                continue;
+            }
+            suffix = GetInputFlags(input_mode);
+        }
+        AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+                index);
+    }
+}
+
+// Declares an "out_attr{N}" OUTPUT binding for every generic output attribute used by the
+// shader. Nothing is emitted for compute shaders.
+void ARBDecompiler::DeclareOutputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(attribute)) {
+            const u32 slot = GetGenericAttributeIndex(attribute);
+            AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", slot, slot, slot);
+        }
+    }
+}
+
+// Declares the "lmem" temporary array when the shader uses local memory. The byte size comes
+// from the compute info for compute shaders and from the shader header otherwise; one array
+// element is declared per four bytes, rounding up.
+void ARBDecompiler::DeclareLocalMemory() {
+    const u64 size_in_bytes =
+        stage == ShaderType::Compute
+            ? static_cast<u64>(registry.GetComputeInfo().local_memory_size_in_words * 4ULL)
+            : static_cast<u64>(ir.GetHeader().GetLocalMemorySize());
+    if (size_in_bytes != 0) {
+        const u64 num_elements = Common::AlignUp(size_in_bytes, 4) / 4;
+        AddLine("TEMP lmem[{}];", num_elements);
+    }
+}
+
+// Declares the "c" parameter array with one program.local entry per global memory region.
+// Nothing is emitted when the shader uses no global memory.
+void ARBDecompiler::DeclareGlobalMemory() {
+    const size_t region_count = ir.GetGlobalMemory().size();
+    if (region_count == 0) {
+        return;
+    }
+    const size_t last_index = region_count - 1;
+    AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", region_count, last_index);
+}
+
+// Declares one CBUFFER per constant buffer used by the IR. The buffer is named after its key in
+// the IR map, while program.buffer slots are handed out sequentially in iteration order.
+void ARBDecompiler::DeclareConstantBuffers() {
+    u32 slot = 0;
+    for (const auto& entry : ir.GetConstantBuffers()) {
+        AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", entry.first, slot);
+        slot += 1;
+    }
+}
+
+// Declares a temporary named "R{index}" for every register the IR reports as used.
+void ARBDecompiler::DeclareRegisters() {
+    const auto& used_registers = ir.GetRegisters();
+    for (const u32 register_index : used_registers) {
+        AddLine("TEMP R{};", register_index);
+    }
+}
+
+// Declares the scratch temporaries: "T0".."T{max_temporaries - 1}" first, followed by the LONG
+// temporaries "L0".."L{max_long_temporaries - 1}".
+void ARBDecompiler::DeclareTemporaries() {
+    std::size_t id = 0;
+    while (id < max_temporaries) {
+        AddLine("TEMP T{};", id);
+        ++id;
+    }
+
+    std::size_t long_id = 0;
+    while (long_id < max_long_temporaries) {
+        AddLine("LONG TEMP L{};", long_id);
+        ++long_id;
+    }
+}
+
+// Declares a temporary named "P{index}" for every predicate the IR reports as used.
+void ARBDecompiler::DeclarePredicates() {
+    const auto& used_predicates = ir.GetPredicates();
+    for (const Tegra::Shader::Pred predicate : used_predicates) {
+        const u64 predicate_index = static_cast<u64>(predicate);
+        AddLine("TEMP P{};", predicate_index);
+    }
+}
+
+// Declares one temporary per entry of INTERNAL_FLAG_NAMES.
+void ARBDecompiler::DeclareInternalFlags() {
+    for (const char* flag_name : INTERNAL_FLAG_NAMES) {
+        AddLine("TEMP {};", flag_name);
+    }
+}
+
+// Emits the instructions that give every declared variable a known starting value: the FSWZA
+// and FSWZB multiplier tables, the position output (vertex and geometry stages only), the
+// generic output attributes, and all used registers and predicates.
+void ARBDecompiler::InitializeVariables() {
+    // Multiplier tables indexed by FSwizzleAdd.
+    AddLine("MOV.F32 FSWZA[0], -1;");
+    AddLine("MOV.F32 FSWZA[1], 1;");
+    AddLine("MOV.F32 FSWZA[2], -1;");
+    AddLine("MOV.F32 FSWZA[3], 0;");
+    AddLine("MOV.F32 FSWZB[0], -1;");
+    AddLine("MOV.F32 FSWZB[1], -1;");
+    AddLine("MOV.F32 FSWZB[2], 1;");
+    AddLine("MOV.F32 FSWZB[3], -1;");
+
+    const bool writes_position = stage == ShaderType::Vertex || stage == ShaderType::Geometry;
+    if (writes_position) {
+        AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+    }
+
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(attribute)) {
+            const u32 slot = GetGenericAttributeIndex(attribute);
+            AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", slot);
+        }
+    }
+
+    for (const u32 register_index : ir.GetRegisters()) {
+        AddLine("MOV.F R{}, {{0, 0, 0, 0}};", register_index);
+    }
+
+    for (const Tegra::Shader::Pred predicate : ir.GetPredicates()) {
+        AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(predicate));
+    }
+}
+
+// Decompiles the shader from its structured (AST) form: declares and zeroes one "F{n}" flow
+// variable per AST variable, initializes the remaining variables and visits the AST program.
+void ARBDecompiler::DecompileAST() {
+    const u32 flow_variable_count = ir.GetASTNumVariables();
+
+    // All declarations are emitted before the first initialization.
+    for (u32 variable = 0; variable != flow_variable_count; ++variable) {
+        AddLine("TEMP F{};", variable);
+    }
+    for (u32 variable = 0; variable != flow_variable_count; ++variable) {
+        AddLine("MOV.U F{}, {{0, 0, 0, 0}};", variable);
+    }
+
+    InitializeVariables();
+
+    VisitAST(ir.GetASTProgram());
+}
+
+// Decompiles the shader as a program-counter driven loop. A "PC" temporary selects which basic
+// block runs on each iteration of an outer REP; the blocks are emitted as a chain of nested
+// IF/ELSE tests on PC. When the flow stack is enabled, the SSY/PBK stacks and their top indices
+// are declared and zeroed as well.
+void ARBDecompiler::DecompileBranchMode() {
+    static constexpr u32 FLOW_STACK_SIZE = 20;
+    const bool uses_flow_stack = !ir.IsFlowStackDisabled();
+
+    if (uses_flow_stack) {
+        AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP SSY_TOP;");
+        AddLine("TEMP PBK_TOP;");
+    }
+
+    AddLine("TEMP PC;");
+
+    if (uses_flow_stack) {
+        AddLine("MOV.U SSY_TOP.x, 0;");
+        AddLine("MOV.U PBK_TOP.x, 0;");
+    }
+
+    InitializeVariables();
+
+    const auto& blocks = ir.GetBasicBlocks();
+    const auto last = blocks.end();
+    auto current = blocks.begin();
+
+    // Execution starts at the address of the first basic block.
+    const u32 entry_address = current->first;
+    AddLine("MOV.U PC.x, {};", entry_address);
+
+    AddLine("REP;");
+
+    // Every block opens an IF that is only closed after the whole chain has been emitted.
+    std::size_t open_branches = 0;
+    for (; current != last;) {
+        const auto& [address, block] = *current;
+        open_branches += 1;
+
+        AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+        AddLine("IF NE.x;");
+
+        VisitBlock(block);
+
+        ++current;
+
+        if (current != last) {
+            // Blocks that do not end in a branch fall through to the next block's address.
+            const auto tail = std::get_if<OperationNode>(&*block[block.size() - 1]);
+            const bool ends_in_branch = tail && tail->GetCode() == OperationCode::Branch;
+            if (!ends_in_branch) {
+                const u32 fallthrough_address = current->first;
+                AddLine("MOV.U PC.x, {};", fallthrough_address);
+                AddLine("CONT;");
+            }
+        }
+
+        AddLine("ELSE;");
+    }
+
+    // Innermost ELSE: PC matched none of the block addresses.
+    AddLine("RET;");
+    for (; open_branches != 0; --open_branches) {
+        AddLine("ENDIF;");
+    }
+
+    AddLine("ENDREP;");
+}
+
+// Emits code for one node of the structured control-flow tree, recursing into child nodes.
+// Conditions are turned into operands with VisitExpression() and tested through the RC
+// condition code register.
+void ARBDecompiler::VisitAST(const ASTNode& node) {
+    // Visits every node of a child list in order.
+    const auto visit_children = [this](auto& children) {
+        for (ASTNode child = children.GetFirst(); child; child = child->GetNext()) {
+            VisitAST(child);
+        }
+    };
+
+    const auto data = &*node->GetInnerData();
+
+    if (const auto program = std::get_if<ASTProgram>(data)) {
+        visit_children(program->nodes);
+        return;
+    }
+    if (const auto if_then = std::get_if<ASTIfThen>(data)) {
+        const std::string condition = VisitExpression(if_then->condition);
+        ResetTemporaries();
+
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        visit_children(if_then->nodes);
+        AddLine("ENDIF;");
+        return;
+    }
+    if (const auto if_else = std::get_if<ASTIfElse>(data)) {
+        AddLine("ELSE;");
+        visit_children(if_else->nodes);
+        return;
+    }
+    if (const auto decoded = std::get_if<ASTBlockDecoded>(data)) {
+        VisitBlock(decoded->nodes);
+        return;
+    }
+    if (const auto var_set = std::get_if<ASTVarSet>(data)) {
+        AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition));
+        ResetTemporaries();
+        return;
+    }
+    if (const auto do_while = std::get_if<ASTDoWhile>(data)) {
+        // The condition operand is built before the loop body is emitted and is reused after it.
+        const std::string condition = VisitExpression(do_while->condition);
+        ResetTemporaries();
+        AddLine("REP;");
+        visit_children(do_while->nodes);
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("BRK (NE.x);");
+        AddLine("ENDREP;");
+        return;
+    }
+    if (const auto ret = std::get_if<ASTReturn>(data)) {
+        // Unconditional returns skip the IF/ENDIF wrapper.
+        const bool unconditional = ExprIsTrue(ret->condition);
+        if (!unconditional) {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ret->condition));
+            AddLine("IF NE.x;");
+            ResetTemporaries();
+        }
+        if (ret->kills) {
+            AddLine("KIL TR;");
+        } else {
+            Exit();
+        }
+        if (!unconditional) {
+            AddLine("ENDIF;");
+        }
+        return;
+    }
+    if (const auto brk = std::get_if<ASTBreak>(data)) {
+        if (ExprIsTrue(brk->condition)) {
+            AddLine("BRK;");
+        } else {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(brk->condition));
+            AddLine("BRK (NE.x);");
+            ResetTemporaries();
+        }
+        return;
+    }
+    if (std::holds_alternative<ASTLabel>(*data)) {
+        // Labels generate no code.
+        return;
+    }
+    UNREACHABLE();
+}
+
+// Emits the code needed to evaluate a control-flow expression and returns the operand that
+// holds its value. Leaf expressions (predicates, flow variables, booleans) emit nothing and
+// return the operand directly; compound expressions write into a freshly allocated temporary.
+std::string ARBDecompiler::VisitExpression(const Expr& node) {
+    const auto data = &*node;
+
+    if (const auto and_expr = std::get_if<ExprAnd>(data)) {
+        std::string dest = AllocTemporary();
+        AddLine("AND.U {}, {}, {};", dest, VisitExpression(and_expr->operand1),
+                VisitExpression(and_expr->operand2));
+        return dest;
+    }
+    if (const auto or_expr = std::get_if<ExprOr>(data)) {
+        std::string dest = AllocTemporary();
+        AddLine("OR.U {}, {}, {};", dest, VisitExpression(or_expr->operand1),
+                VisitExpression(or_expr->operand2));
+        return dest;
+    }
+    if (const auto not_expr = std::get_if<ExprNot>(data)) {
+        std::string dest = AllocTemporary();
+        const std::string operand = VisitExpression(not_expr->operand1);
+        AddLine("CMP.S {}, {}, 0, -1;", dest, operand);
+        return dest;
+    }
+    if (const auto pred_expr = std::get_if<ExprPredicate>(data)) {
+        return fmt::format("P{}.x", static_cast<u64>(pred_expr->predicate));
+    }
+    if (const auto cc_expr = std::get_if<ExprCondCode>(data)) {
+        return Visit(ir.GetConditionCode(cc_expr->cc));
+    }
+    if (const auto var_expr = std::get_if<ExprVar>(data)) {
+        return fmt::format("F{}.x", var_expr->var_index);
+    }
+    if (const auto bool_expr = std::get_if<ExprBoolean>(data)) {
+        if (bool_expr->value) {
+            return "0xffffffff";
+        }
+        return "0";
+    }
+    if (const auto gpr_expr = std::get_if<ExprGprEqual>(data)) {
+        std::string dest = AllocTemporary();
+        AddLine("SEQ.U {}, R{}.x, {};", dest, gpr_expr->gpr, gpr_expr->value);
+        return dest;
+    }
+    UNREACHABLE();
+    return "0";
+}
+
+// Visits every node of a block in order, discarding the operands returned by Visit().
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+    for (auto it = bb.begin(); it != bb.end(); ++it) {
+        Visit(*it);
+    }
+}
+
+// Emits the code needed to evaluate an IR node and returns the operand that holds its value.
+// Operation nodes are dispatched through OPERATION_DECOMPILERS; every other node kind is
+// handled inline. Nodes that produce no value (conditionals, comments) return an empty string.
+std::string ARBDecompiler::Visit(const Node& node) {
+    const auto data = &*node;
+
+    if (const auto op = std::get_if<OperationNode>(data)) {
+        // Amend nodes are visited before the operation they are attached to.
+        if (const auto amend = op->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend));
+        }
+        const std::size_t code = static_cast<std::size_t>(op->GetCode());
+        if (code >= OPERATION_DECOMPILERS.size()) {
+            UNREACHABLE_MSG("Out of bounds operation: {}", code);
+            return {};
+        }
+        const auto handler = OPERATION_DECOMPILERS[code];
+        if (handler == nullptr) {
+            UNREACHABLE_MSG("Undefined operation: {}", code);
+            return {};
+        }
+        return (this->*handler)(*op);
+    }
+
+    if (const auto gpr = std::get_if<GprNode>(data)) {
+        const u32 register_index = gpr->GetIndex();
+        // The zero register always reads as a constant zero.
+        if (register_index == Register::ZeroIndex) {
+            return "{0, 0, 0, 0}.x";
+        }
+        return fmt::format("R{}.x", register_index);
+    }
+
+    if (const auto custom_var = std::get_if<CustomVarNode>(data)) {
+        return fmt::format("CV{}.x", custom_var->GetIndex());
+    }
+
+    if (const auto imm = std::get_if<ImmediateNode>(data)) {
+        std::string dest = AllocTemporary();
+        AddLine("MOV.U {}, {};", dest, imm->GetValue());
+        return dest;
+    }
+
+    if (const auto pred = std::get_if<PredicateNode>(data)) {
+        std::string dest = AllocTemporary();
+        const auto pred_index = pred->GetIndex();
+        if (pred_index == Tegra::Shader::Pred::UnusedIndex) {
+            AddLine("MOV.S {}, -1;", dest);
+        } else if (pred_index == Tegra::Shader::Pred::NeverExecute) {
+            AddLine("MOV.S {}, 0;", dest);
+        } else {
+            AddLine("MOV.S {}, P{}.x;", dest, static_cast<u64>(pred_index));
+        }
+        if (pred->IsNegated()) {
+            AddLine("CMP.S {}, {}, 0, -1;", dest, dest);
+        }
+        return dest;
+    }
+
+    if (const auto abuf = std::get_if<AbufNode>(data)) {
+        if (abuf->IsPhysicalBuffer()) {
+            UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+            return "{0, 0, 0, 0}.x";
+        }
+
+        const Attribute::Index attribute = abuf->GetIndex();
+        const u32 element = abuf->GetElement();
+        const char component = Swizzle(element);
+        switch (attribute) {
+        case Attribute::Index::Position: {
+            if (stage != ShaderType::Geometry) {
+                return fmt::format("{}.position.{}", StageInputName(stage), component);
+            }
+            // Geometry inputs are indexed by the vertex held in the attribute's buffer node.
+            return fmt::format("{}_position[{}].{}", StageInputName(stage),
+                               Visit(abuf->GetBuffer()), component);
+        }
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            ASSERT(stage == ShaderType::Vertex);
+            if (element == 2) {
+                return "vertex.instance";
+            }
+            if (element == 3) {
+                return "vertex.id";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            break;
+        case Attribute::Index::PointCoord:
+            if (element == 0) {
+                return "fragment.pointcoord.x";
+            }
+            if (element == 1) {
+                return "fragment.pointcoord.y";
+            }
+            UNIMPLEMENTED();
+            break;
+        case Attribute::Index::FrontFacing: {
+            ASSERT(stage == ShaderType::Fragment);
+            ASSERT(element == 3);
+            const std::string facing = AllocVectorTemporary();
+            AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+            AddLine("MOV.U.CC RC.x, -RC;");
+            AddLine("MOV.S {}.x, 0;", facing);
+            AddLine("MOV.S {}.x (NE.x), -1;", facing);
+            return fmt::format("{}.x", facing);
+        }
+        default:
+            if (IsGenericAttribute(attribute)) {
+                if (stage != ShaderType::Geometry) {
+                    return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+                                       GetGenericAttributeIndex(attribute), component);
+                }
+                return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(attribute),
+                                   Visit(abuf->GetBuffer()), component);
+            }
+            UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(attribute));
+            break;
+        }
+        // Unsupported attribute reads evaluate to zero.
+        return "{0, 0, 0, 0}.x";
+    }
+
+    if (const auto cbuf = std::get_if<CbufNode>(data)) {
+        // Immediate offsets are written inline; anything else is evaluated first.
+        const auto& offset_node = cbuf->GetOffset();
+        const auto immediate_offset = std::get_if<ImmediateNode>(&*offset_node);
+        std::string offset = immediate_offset ? std::to_string(immediate_offset->GetValue())
+                                              : Visit(offset_node);
+        std::string dest = AllocTemporary();
+        AddLine("LDC.F32 {}, cbuf{}[{}];", dest, cbuf->GetIndex(), offset);
+        return dest;
+    }
+
+    if (const auto gmem = std::get_if<GmemNode>(data)) {
+        // The load is predicated on the condition code set by GlobalMemoryPointer(); the
+        // destination keeps the zero written here when the load does not execute.
+        std::string dest = AllocTemporary();
+        AddLine("MOV {}, 0;", dest);
+        const std::string pointer = GlobalMemoryPointer(*gmem);
+        AddLine("LOAD.U32 {} (NE.x), {};", dest, pointer);
+        return dest;
+    }
+
+    if (const auto lmem = std::get_if<LmemNode>(data)) {
+        // The address operand is reused as the destination; it is divided by four to index lmem.
+        std::string address = Visit(lmem->GetAddress());
+        AddLine("SHR.U {}, {}, 2;", address, address);
+        AddLine("MOV.U {}, lmem[{}].x;", address, address);
+        return address;
+    }
+
+    if (const auto smem = std::get_if<SmemNode>(data)) {
+        // The address operand is reused as the destination of the load.
+        std::string address = Visit(smem->GetAddress());
+        AddLine("LDS.U32 {}, shared_mem[{}];", address, address);
+        return address;
+    }
+
+    if (const auto flag = std::get_if<InternalFlagNode>(data)) {
+        const std::size_t flag_index = static_cast<std::size_t>(flag->GetFlag());
+        return fmt::format("{}.x", INTERNAL_FLAG_NAMES[flag_index]);
+    }
+
+    if (const auto conditional = std::get_if<ConditionalNode>(data)) {
+        if (const auto amend = conditional->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend));
+        }
+        const std::string condition = Visit(conditional->GetCondition());
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        VisitBlock(conditional->GetCode());
+        AddLine("ENDIF;");
+        return {};
+    }
+
+    if (std::holds_alternative<CommentNode>(*data)) {
+        // Comment nodes are dropped: GLASM lacks comments, so emitting their text would
+        // generate invalid code.
+        return {};
+    }
+
+    UNIMPLEMENTED();
+    return {};
+}
+
+// Packs the texture coordinates of a texture operation into a vector temporary. The operation
+// operands come first, followed by the array index (array samplers) and the depth compare
+// value (shadow samplers).
+//
+// Returns a tuple of: the coordinate operand string to place in the texture instruction, the
+// temporary callers may reuse for the result, and the number of components written. Shadow
+// cube arrays put the compare value in a second temporary; in that case the coordinate string
+// names both temporaries and the returned count is zero.
+std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+
+    const bool needs_extra_coord = meta.sampler.is_shadow && meta.sampler.is_array &&
+                                   meta.sampler.type == Tegra::Shader::TextureType::TextureCube;
+    const std::size_t operand_count = operation.GetOperandsCount();
+    std::string coord = AllocVectorTemporary();
+
+    std::size_t component = 0;
+    while (component < operand_count) {
+        const std::string value = Visit(operation[component]);
+        AddLine("MOV.F {}.{}, {};", coord, Swizzle(component), value);
+        ++component;
+    }
+    if (meta.sampler.is_array) {
+        const std::string layer = Visit(meta.array);
+        AddLine("I2F.S {}.{}, {};", coord, Swizzle(component), layer);
+        ++component;
+    }
+    if (!meta.sampler.is_shadow) {
+        return {coord, coord, component};
+    }
+
+    std::string compare = Visit(meta.depth_compare);
+    if (needs_extra_coord) {
+        // All four components are taken, so the compare value travels in its own temporary.
+        ASSERT(component == 4);
+        std::string extra_coord = AllocVectorTemporary();
+        AddLine("MOV.F {}.x, {};", extra_coord, compare);
+        return {fmt::format("{}, {}", coord, extra_coord), extra_coord, 0};
+    }
+    AddLine("MOV.F {}.{}, {};", coord, Swizzle(component), compare);
+    ++component;
+    return {coord, coord, component};
+}
+
+// Builds the ", offset(...)" suffix for a texture instruction from the operation's aoffi
+// nodes, copying each one into consecutive components of a vector temporary. Returns an empty
+// string when the operation has no aoffi nodes.
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    if (meta.aoffi.empty()) {
+        return {};
+    }
+
+    const std::string offsets = AllocVectorTemporary();
+    std::size_t component = 0;
+    for (const auto& offset_node : meta.aoffi) {
+        const std::string value = Visit(offset_node);
+        AddLine("MOV.S {}.{}, {};", offsets, Swizzle(component), value);
+        ++component;
+    }
+    return fmt::format(", offset({})", offsets);
+}
+
+// Emits the address computation for a global memory access and returns the operand holding
+// the final 64-bit address. The last emitted instruction compares the offset against the
+// region length and sets the condition code, so callers can predicate the access on it.
+std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) {
+    // Read a bindless SSBO, return its address and set CC accordingly
+    // address = c[binding].xy
+    // length = c[binding].z
+    const u32 slot = global_memory_names.at(gmem.GetDescriptor());
+
+    const std::string address = AllocLongVectorTemporary();
+    std::string offset = AllocTemporary();
+
+    AddLine("PK64.U {}, c[{}];", address, slot);
+    // offset = real address - base address
+    AddLine("SUB.U {}, {}, {};", offset, Visit(gmem.GetRealAddress()),
+            Visit(gmem.GetBaseAddress()));
+    AddLine("CVT.U64.U32 {}.z, {};", address, offset);
+    AddLine("ADD.U64 {}.x, {}.x, {}.z;", address, address, address);
+    // Compare offset to length and set CC
+    AddLine("SLT.U.CC RC.x, {}, c[{}].z;", offset, slot);
+    return fmt::format("{}.x", address);
+}
+
+// Emits the shader exit sequence. Fragment shaders first copy their outputs: every enabled
+// color component of every render target is read from consecutive registers starting at R0,
+// and the depth output, when enabled, is read from the register after the next unused one.
+// All stages finish with RET.
+void ARBDecompiler::Exit() {
+    if (stage == ShaderType::Fragment) {
+        // Registers the shader never used are read as zero.
+        const auto register_or_zero = [this](u32 reg) -> std::string {
+            // TODO(Rodrigo): Replace with contains once C++20 releases
+            const auto& used_registers = ir.GetRegisters();
+            const bool is_used = used_registers.find(reg) != used_registers.end();
+            if (!is_used) {
+                return "{0, 0, 0, 0}.x";
+            }
+            return fmt::format("R{}.x", reg);
+        };
+
+        const auto& header = ir.GetHeader();
+        u32 next_register = 0;
+        for (u32 target = 0; target < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets;
+             ++target) {
+            for (u32 component = 0; component < 4; ++component) {
+                if (header.ps.IsColorComponentOutputEnabled(target, component)) {
+                    AddLine("MOV.F result_color{}.{}, {};", target, Swizzle(component),
+                            register_or_zero(next_register));
+                    ++next_register;
+                }
+            }
+        }
+        if (header.ps.omap.depth) {
+            AddLine("MOV.F result.depth.z, {};", register_or_zero(next_register + 1));
+        }
+    }
+
+    AddLine("RET;");
+}
+
+// Emits an assignment of operation[1] (source) to operation[0] (destination). The destination
+// may be a register, an output attribute, or local, shared or global memory. Temporaries are
+// reset once the assignment has been emitted. Always returns an empty string, since an
+// assignment produces no value.
+std::string ARBDecompiler::Assign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string dest_name;
+    if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+        if (gpr->GetIndex() == Register::ZeroIndex) {
+            // Writing to Register::ZeroIndex is a no op
+            return {};
+        }
+        dest_name = fmt::format("R{}.x", gpr->GetIndex());
+    } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (const Attribute::Index index = abuf->GetIndex()) {
+        case Attribute::Index::Position:
+            dest_name = fmt::format("result.position.{}", swizzle);
+            break;
+        case Attribute::Index::LayerViewportPointSize:
+            switch (element) {
+            case 0:
+                UNIMPLEMENTED();
+                return {};
+            case 1:
+            case 2:
+                // Layer and viewport writes need NV_viewport_array2; drop the write without it.
+                if (!device.HasNvViewportArray2()) {
+                    LOG_ERROR(
+                        Render_OpenGL,
+                        "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+                    return {};
+                }
+                dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+                break;
+            case 3:
+                dest_name = "result.pointsize.x";
+                break;
+            }
+            break;
+        case Attribute::Index::ClipDistances0123:
+            dest_name = fmt::format("result.clip[{}].x", element);
+            break;
+        case Attribute::Index::ClipDistances4567:
+            dest_name = fmt::format("result.clip[{}].x", element + 4);
+            break;
+        default:
+            if (!IsGenericAttribute(index)) {
+                UNREACHABLE();
+                return {};
+            }
+            dest_name =
+                fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+            break;
+        }
+    } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+        // The address is divided by four in place to index lmem.
+        const std::string address = Visit(lmem->GetAddress());
+        AddLine("SHR.U {}, {}, 2;", address, address);
+        dest_name = fmt::format("lmem[{}].x", address);
+    } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+        AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+        ResetTemporaries();
+        return {};
+    } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+        // GlobalMemoryPointer() finishes by setting the condition code from its bounds check,
+        // so it has to be emitted before the IF that tests that condition code. The source is
+        // evaluated first so that none of its instructions run between the check and the IF.
+        const std::string value = Visit(src);
+        const std::string pointer = GlobalMemoryPointer(*gmem);
+        AddLine("IF NE.x;");
+        AddLine("STORE.U32 {}, {};", value, pointer);
+        AddLine("ENDIF;");
+        ResetTemporaries();
+        return {};
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+// Emits a CMP.S that picks between operands 1 and 2 based on operand 0 and returns the
+// temporary that receives the result.
+std::string ARBDecompiler::Select(Operation operation) {
+    std::string selected = AllocTemporary();
+    AddLine("CMP.S {}, {}, {}, {};", selected, Visit(operation[0]), Visit(operation[1]),
+            Visit(operation[2]));
+    return selected;
+}
+
+// Clamps operand 0 between operand 1 (low) and operand 2 (high). When both bounds are the
+// immediates 0 and 1.0f, a single saturating move is emitted; otherwise a MIN followed by a
+// MAX is used.
+std::string ARBDecompiler::FClamp(Operation operation) {
+    // 1.0f in hex, replace with std::bit_cast on C++20
+    static constexpr u32 POSITIVE_ONE = 0x3f800000;
+
+    std::string clamped = AllocTemporary();
+    const Node& value = operation[0];
+    const Node& low = operation[1];
+    const Node& high = operation[2];
+
+    const auto* const low_imm = std::get_if<ImmediateNode>(&*low);
+    const auto* const high_imm = std::get_if<ImmediateNode>(&*high);
+    const bool is_saturate = low_imm && high_imm && low_imm->GetValue() == 0 &&
+                             high_imm->GetValue() == POSITIVE_ONE;
+    if (is_saturate) {
+        AddLine("MOV.F32.SAT {}, {};", clamped, Visit(value));
+        return clamped;
+    }
+
+    AddLine("MIN.F {}, {}, {};", clamped, Visit(value), Visit(high));
+    AddLine("MAX.F {}, {}, {};", clamped, clamped, Visit(low));
+    return clamped;
+}
+
+// Unpacks operand 0 with UP2H.F into the x component of a vector temporary and returns it.
+std::string ARBDecompiler::FCastHalf0(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.x, {};", unpacked, packed);
+    return fmt::format("{}.x", unpacked);
+}
+
+// Unpacks operand 0 with UP2H.F into the y component of a vector temporary, copies that
+// component to x and returns the x component.
+std::string ARBDecompiler::FCastHalf1(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.y, {};", unpacked, packed);
+    AddLine("MOV {}.x, {}.y;", unpacked, unpacked);
+    return fmt::format("{}.x", unpacked);
+}
+
+// Computes a square root as the reciprocal (RCP) of the reciprocal square root (RSQ).
+std::string ARBDecompiler::FSqrt(Operation operation) {
+    std::string root = AllocTemporary();
+    const std::string value = Visit(operation[0]);
+    AddLine("RSQ.F32 {}, {};", root, value);
+    AddLine("RCP.F32 {}, {};", root, root);
+    return root;
+}
+
+// Adds operands 0 and 1 after scaling each by an entry of the FSWZA / FSWZB tables. The table
+// index is a two-bit field taken from operand 2, selected by the low two bits of the thread id.
+// Without warp intrinsics support an error is logged and a plain addition is emitted instead.
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+    const std::string scratch = AllocVectorTemporary();
+    if (!device.HasWarpIntrinsics()) {
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        AddLine("ADD.F {}.x, {}, {};", scratch, Visit(operation[0]), Visit(operation[1]));
+        return fmt::format("{}.x", scratch);
+    }
+
+    // scratch.z = (operand2 >> ((threadid & 3) * 2)) & 3
+    AddLine("AND.U {}.z, {}.threadid, 3;", scratch, StageInputName(stage));
+    AddLine("SHL.U {}.z, {}.z, 1;", scratch, scratch);
+    const std::string mask = Visit(operation[2]);
+    AddLine("SHR.U {}.z, {}, {}.z;", scratch, mask, scratch);
+    AddLine("AND.U {}.z, {}.z, 3;", scratch, scratch);
+
+    const std::string lhs = Visit(operation[0]);
+    AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", scratch, lhs, scratch);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", scratch, rhs, scratch);
+    AddLine("ADD.F32 {}.x, {}.x, {}.y;", scratch, scratch, scratch);
+    return fmt::format("{}.x", scratch);
+}
+
+// Packed half addition: unpacks both operands with UP2H.F, adds them with ADD.F16 and packs
+// the result back with PK2H.F.
+std::string ARBDecompiler::HAdd2(Operation operation) {
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    const std::string packed_lhs = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", lhs, packed_lhs);
+    const std::string packed_rhs = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", rhs, packed_rhs);
+    AddLine("ADD.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+// Packed half multiplication: unpacks both operands with UP2H.F, multiplies them with MUL.F16
+// and packs the result back with PK2H.F.
+std::string ARBDecompiler::HMul2(Operation operation) {
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    const std::string packed_lhs = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", lhs, packed_lhs);
+    const std::string packed_rhs = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", rhs, packed_rhs);
+    AddLine("MUL.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+// Packed half multiply-add: unpacks the three operands with UP2H.F, combines them with MAD.F16
+// and packs the result back with PK2H.F.
+std::string ARBDecompiler::HFma2(Operation operation) {
+    const std::string factor_a = AllocVectorTemporary();
+    const std::string factor_b = AllocVectorTemporary();
+    const std::string addend = AllocVectorTemporary();
+    const std::string packed_a = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", factor_a, packed_a);
+    const std::string packed_b = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", factor_b, packed_b);
+    const std::string packed_c = Visit(operation[2]);
+    AddLine("UP2H.F {}.xy, {};", addend, packed_c);
+    AddLine("MAD.F16 {}, {}, {}, {};", factor_a, factor_a, factor_b, addend);
+    AddLine("PK2H.F {}.x, {};", factor_a, factor_a);
+    return fmt::format("{}.x", factor_a);
+}
+
+// Packed half absolute value: unpacks operand 0 and repacks it with the |x| source modifier.
+std::string ARBDecompiler::HAbsolute(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    AddLine("PK2H.F {}.x, |{}|;", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Conditionally negates each half of operand 0: the x half is negated when operand 1 is
+// non-zero and the y half when operand 2 is non-zero. The result is packed back with PK2H.F.
+std::string ARBDecompiler::HNegate(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+
+    const std::string negate_x = Visit(operation[1]);
+    AddLine("MOVC.S RC.x, {};", negate_x);
+    AddLine("MOV.F {}.x (NE.x), -{}.x;", halves, halves);
+
+    const std::string negate_y = Visit(operation[2]);
+    AddLine("MOVC.S RC.x, {};", negate_y);
+    AddLine("MOV.F {}.y (NE.x), -{}.y;", halves, halves);
+
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Packed half clamp: unpacks operand 0, applies MAX.F against operand 1 and MIN.F against
+// operand 2 (each bound copied into both x and y), then packs the result with PK2H.F.
+std::string ARBDecompiler::HClamp(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string bound = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+
+    // Lower bound
+    const std::string low = Visit(operation[1]);
+    AddLine("MOV.U {}.x, {};", bound, low);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MAX.F {}, {}, {};", halves, halves, bound);
+
+    // Upper bound
+    const std::string high = Visit(operation[2]);
+    AddLine("MOV.U {}.x, {};", bound, high);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MIN.F {}, {}, {};", halves, halves, bound);
+
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Packs operand 0 as the x half of a half pair whose y half is zero.
+std::string ARBDecompiler::HCastFloat(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", halves);
+    const std::string value = Visit(operation[0]);
+    AddLine("MOV.F {}.x, {};", halves, value);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Rearranges operand 0 into a packed half pair according to the HalfType in the operation's
+// meta. H0_H1 returns the operand untouched; F32 copies the operand into both halves; H0_H0
+// and H1_H1 unpack the operand and duplicate its x or y half respectively.
+std::string ARBDecompiler::HUnpack(Operation operation) {
+    using Tegra::Shader::HalfType;
+
+    std::string operand = Visit(operation[0]);
+    const HalfType type = std::get<HalfType>(operation.GetMeta());
+    switch (type) {
+    case HalfType::H0_H1:
+        return operand;
+    case HalfType::F32:
+    case HalfType::H0_H0:
+    case HalfType::H1_H1:
+        break;
+    default:
+        UNREACHABLE();
+        return "{0, 0, 0, 0}.x";
+    }
+
+    const std::string halves = AllocVectorTemporary();
+    if (type == HalfType::F32) {
+        AddLine("MOV.U {}.x, {};", halves, operand);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+    } else if (type == HalfType::H0_H0) {
+        AddLine("UP2H.F {}.xy, {};", halves, operand);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+    } else {
+        AddLine("UP2H.F {}.xy, {};", halves, operand);
+        AddLine("MOV.U {}.x, {}.y;", halves, halves);
+    }
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Unpacks operand 0 with UP2H.F and returns the x component of the unpacked pair.
+std::string ARBDecompiler::HMergeF32(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    return fmt::format("{}.x", halves);
+}
+
+// Replaces the x half of operand 0 with the x half of operand 1 and repacks the pair.
+std::string ARBDecompiler::HMergeH0(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string dest_pair = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, dest_pair);
+    const std::string src_pair = Visit(operation[1]);
+    AddLine("UP2H.F {}.zw, {};", halves, src_pair);
+    AddLine("MOV.U {}.x, {}.z;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Replaces the y half of operand 0 with the y half of operand 1 and repacks the pair.
+std::string ARBDecompiler::HMergeH1(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string dest_pair = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, dest_pair);
+    const std::string src_pair = Visit(operation[1]);
+    AddLine("UP2H.F {}.zw, {};", halves, src_pair);
+    AddLine("MOV.U {}.y, {}.w;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Packs operand 0 (x half) and operand 1 (y half) into a single half pair with PK2H.F.
+std::string ARBDecompiler::HPack2(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string low = Visit(operation[0]);
+    AddLine("MOV.U {}.x, {};", halves, low);
+    const std::string high = Visit(operation[1]);
+    AddLine("MOV.U {}.y, {};", halves, high);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+// Emits an assignment of operation[1] to a predicate or internal flag (operation[0]) and
+// resets the temporaries. Writes to the NeverExecute and UnusedIndex predicates are dropped.
+// Always returns an empty string.
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string target;
+
+    const auto pred = std::get_if<PredicateNode>(&*dest);
+    const auto flag = pred ? nullptr : std::get_if<InternalFlagNode>(&*dest);
+    if (pred) {
+        ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+        const Tegra::Shader::Pred pred_index = pred->GetIndex();
+        const bool is_read_only = pred_index == Tegra::Shader::Pred::NeverExecute ||
+                                  pred_index == Tegra::Shader::Pred::UnusedIndex;
+        if (is_read_only) {
+            // Writing to these predicates is a no-op
+            return {};
+        }
+        target = fmt::format("P{}.x", static_cast<u64>(pred_index));
+    } else if (flag) {
+        const std::size_t flag_index = static_cast<std::size_t>(flag->GetFlag());
+        target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[flag_index]);
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    const std::string value = Visit(src);
+    AddLine("MOV.U {}, {};", target, value);
+    ResetTemporaries();
+    return {};
+}
+
+// Copies the component of operand 0 selected by the immediate in operand 1 into a temporary.
+std::string ARBDecompiler::LogicalPick2(Operation operation) {
+    std::string picked = AllocTemporary();
+    const u32 component = std::get<ImmediateNode>(*operation[1]).GetValue();
+    const std::string pair = Visit(operation[0]);
+    AddLine("MOV.U {}, {}.{};", picked, pair, Swizzle(component));
+    return picked;
+}
+
+// Bitwise-ANDs the x and y components of operand 0 into a temporary.
+std::string ARBDecompiler::LogicalAnd2(Operation operation) {
+    std::string combined = AllocTemporary();
+    const std::string pair = Visit(operation[0]);
+    AddLine("AND.U {}, {}.x, {}.y;", combined, pair, pair);
+    return combined;
+}
+
+// Produces -1 when neither operand is NaN and 0 otherwise. Both operands are moved through
+// the condition code register so the NAN condition can be tested per component.
+std::string ARBDecompiler::FloatOrdered(Operation operation) {
+    std::string ordered = AllocTemporary();
+    const std::string lhs = Visit(operation[0]);
+    AddLine("MOVC.F32 RC.x, {};", lhs);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MOVC.F32 RC.y, {};", rhs);
+    AddLine("MOV.S {}, -1;", ordered);
+    AddLine("MOV.S {} (NAN.x), 0;", ordered);
+    AddLine("MOV.S {} (NAN.y), 0;", ordered);
+    return ordered;
+}
+
+// Produces -1 when at least one operand is NaN and 0 otherwise. Both operands are moved
+// through the condition code register so the NAN condition can be tested per component.
+std::string ARBDecompiler::FloatUnordered(Operation operation) {
+    std::string unordered = AllocTemporary();
+    const std::string lhs = Visit(operation[0]);
+    AddLine("MOVC.F32 RC.x, {};", lhs);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MOVC.F32 RC.y, {};", rhs);
+    AddLine("MOV.S {}, 0;", unordered);
+    AddLine("MOV.S {} (NAN.x), -1;", unordered);
+    AddLine("MOV.S {} (NAN.y), -1;", unordered);
+    return unordered;
+}
+
+/// Emits an unsigned add into RC and returns a temporary holding -1 when the CF.x condition is
+/// taken afterwards, 0 otherwise.
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
+    std::string carry = AllocTemporary();
+    AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+    // Assume no carry, then overwrite inside the conditional block.
+    AddLine("MOV.S {}, 0;", carry);
+    AddLine("IF CF.x;");
+    AddLine("MOV.S {}, -1;", carry);
+    AddLine("ENDIF;");
+    return carry;
+}
+
+/// Emits a texture sample (TEX, or TXB / TXL when a bias / explicit LOD is present) and returns
+/// the x component of the destination register, which is loaded with the requested element.
+std::string ARBDecompiler::Texture(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coord_operand, dest, used_components] = BuildCoords(operation);
+
+    std::string_view instruction = "TEX";
+    std::string extra_operand;
+    if (meta.bias) {
+        ASSERT(!meta.lod);
+        instruction = "TXB";
+
+        if (used_components >= 4) {
+            // No free component left in the coordinate register: pass the bias separately.
+            const std::string bias_reg = AllocTemporary();
+            AddLine("MOV.F {}, {};", bias_reg, Visit(meta.bias));
+            extra_operand = fmt::format(" {},", bias_reg);
+        } else {
+            AddLine("MOV.F {}.w, {};", dest, Visit(meta.bias));
+        }
+    }
+    if (meta.lod) {
+        ASSERT(!meta.bias);
+        instruction = "TXL";
+
+        if (used_components >= 4) {
+            // No free component left in the coordinate register: pass the LOD separately.
+            const std::string lod_reg = AllocTemporary();
+            AddLine("MOV.F {}, {};", lod_reg, Visit(meta.lod));
+            extra_operand = fmt::format(" {},", lod_reg);
+        } else {
+            AddLine("MOV.F {}.w, {};", dest, Visit(meta.lod));
+        }
+    }
+
+    AddLine("{}.F {}, {},{} texture[{}], {}{};", instruction, dest, coord_operand, extra_operand,
+            sampler_id, TextureType(meta), BuildAoffi(operation));
+    // Move the requested element into .x so callers can always read a scalar.
+    AddLine("MOV.U {}.x, {}.{};", dest, dest, Swizzle(meta.element));
+    return fmt::format("{}.x", dest);
+}
+
+/// Emits a texture gather (TXG) and returns the x component of the destination register, which
+/// is loaded with the requested element of the gathered result.
+///
+/// For non-shadow samplers the gathered component is selected by the immediate stored in
+/// meta.component and appended to the texture operand as a swizzle suffix.
+std::string ARBDecompiler::TextureGather(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coords, temporary, swizzle] = BuildCoords(operation);
+
+    std::string comp;
+    if (!meta.sampler.is_shadow) {
+        const auto& immediate = std::get<ImmediateNode>(*meta.component);
+        comp = fmt::format(".{}", Swizzle(immediate.GetValue()));
+    }
+
+    // As in Texture() and TexelFetch(): `coords` is the source operand and `temporary` is the
+    // destination register. The two can differ, so they must not be swapped.
+    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, coords, sampler_id, comp,
+            TextureType(meta), BuildAoffi(operation));
+    // Move the requested element of the result into .x so callers can read a scalar.
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+/// Emits a TXQ query for the sampler and returns the x component of the result register, which
+/// is loaded with the requested element. The LOD operand defaults to "0" when none is given.
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string result = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    std::string level = "0";
+    if (operation.GetOperandsCount() > 0) {
+        level = Visit(operation[0]);
+    }
+    AddLine("TXQ {}, {}, texture[{}], {};", result, level, sampler_id, TextureType(meta));
+    AddLine("MOV.U {}.x, {}.{};", result, result, Swizzle(meta.element));
+    return fmt::format("{}.x", result);
+}
+
+/// Emits a LOD query for the sampler. The result is scaled by 256 in x and y, truncated to
+/// integers, and the requested element is moved into the x component that is returned.
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string reg = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    // Pack every operand into consecutive components of the working register.
+    const std::size_t num_operands = operation.GetOperandsCount();
+    for (std::size_t component = 0; component < num_operands; ++component) {
+        AddLine("MOV.F {}.{}, {};", reg, Swizzle(component), Visit(operation[component]));
+    }
+    AddLine("LOD.F {}, {}, texture[{}], {};", reg, reg, sampler_id, TextureType(meta));
+    AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", reg, reg);
+    AddLine("TRUNC.S {}, {};", reg, reg);
+    AddLine("MOV.U {}.x, {}.{};", reg, reg, Swizzle(meta.element));
+    return fmt::format("{}.x", reg);
+}
+
+/// Emits a texel fetch (TXF) and returns the x component of the destination register, which is
+/// loaded with the requested element. For non-buffer samplers the LOD is written to .w first.
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coord_operand, dest, used_components] = BuildCoords(operation);
+
+    const bool needs_lod = !meta.sampler.is_buffer;
+    if (needs_lod) {
+        // The LOD goes into .w, so the coordinates must leave that component free.
+        ASSERT(used_components < 4);
+        AddLine("MOV.F {}.w, {};", dest, Visit(meta.lod));
+    }
+    AddLine("TXF.F {}, {}, texture[{}], {}{};", dest, coord_operand, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", dest, dest, Swizzle(meta.element));
+    return fmt::format("{}.x", dest);
+}
+
+/// Emits a texture sample with explicit derivatives (TXD). meta.derivates holds interleaved
+/// (ddx, ddy) pairs, one pair per component. The coordinate register doubles as the destination
+/// and its x component, loaded with the requested element, is returned.
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string grad_x = AllocVectorTemporary();
+    const std::string grad_y = AllocVectorTemporary();
+    const std::string reg = std::get<1>(BuildCoords(operation));
+
+    const std::size_t num_pairs = meta.derivates.size() / 2;
+    for (std::size_t pair = 0; pair < num_pairs; ++pair) {
+        const char component = Swizzle(pair);
+        const std::size_t first = pair * 2;
+        AddLine("MOV.F {}.{}, {};", grad_x, component, Visit(meta.derivates[first]));
+        AddLine("MOV.F {}.{}, {};", grad_y, component, Visit(meta.derivates[first + 1]));
+    }
+
+    // The sample result overwrites the coordinate register.
+    AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", reg, reg, grad_x, grad_y, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.F {}.x, {}.{};", reg, reg, Swizzle(meta.element));
+    return fmt::format("{}.x", reg);
+}
+
+/// Emits an image load (LOADIM). The operands are packed as coordinates into a vector
+/// temporary, which also receives the result; its x component, loaded with the requested
+/// element, is returned.
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t num_coords = operation.GetOperandsCount();
+    const std::string_view image_type = ImageType(meta.image.type);
+
+    const std::string reg = AllocVectorTemporary();
+    for (std::size_t component = 0; component < num_coords; ++component) {
+        AddLine("MOV.S {}.{}, {};", reg, Swizzle(component), Visit(operation[component]));
+    }
+    AddLine("LOADIM.F {}, {}, image[{}], {};", reg, reg, image_id, image_type);
+    AddLine("MOV.F {}.x, {}.{};", reg, reg, Swizzle(meta.element));
+    return fmt::format("{}.x", reg);
+}
+
+/// Emits an image store (STOREIM). The operation's operands are packed as coordinates and
+/// meta.values as the value to write. Produces no result expression.
+std::string ARBDecompiler::ImageStore(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t coord_count = operation.GetOperandsCount();
+    const std::size_t value_count = meta.values.size();
+    const std::string_view image_type = ImageType(meta.image.type);
+
+    const std::string coord_reg = AllocVectorTemporary();
+    const std::string value_reg = AllocVectorTemporary();
+    for (std::size_t component = 0; component < coord_count; ++component) {
+        AddLine("MOV.S {}.{}, {};", coord_reg, Swizzle(component), Visit(operation[component]));
+    }
+    for (std::size_t component = 0; component < value_count; ++component) {
+        AddLine("MOV.F {}.{}, {};", value_reg, Swizzle(component),
+                Visit(meta.values[component]));
+    }
+    AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value_reg, coord_reg, image_type);
+    return std::string{};
+}
+
+/// Emits a direct branch: writes the immediate target into PC.x and issues CONT.
+std::string ARBDecompiler::Branch(Operation operation) {
+    const auto destination = std::get<ImmediateNode>(*operation[0]);
+    const auto address = destination.GetValue();
+    AddLine("MOV.U PC.x, {};", address);
+    AddLine("CONT;");
+    return std::string{};
+}
+
+/// Emits an indirect branch: writes the visited operand into PC.x and issues CONT.
+std::string ARBDecompiler::BranchIndirect(Operation operation) {
+    const std::string address = Visit(operation[0]);
+    AddLine("MOV.U PC.x, {};", address);
+    AddLine("CONT;");
+    return std::string{};
+}
+
+/// Pushes the immediate target onto the flow stack selected by the operation's meta: stores it
+/// at the current top index and then increments that index.
+std::string ARBDecompiler::PushFlowStack(Operation operation) {
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const u32 address = std::get<ImmediateNode>(*operation[0]).GetValue();
+    const std::string_view name = StackName(stack_class);
+    AddLine("MOV.U {}[{}_TOP.x].x, {};", name, name, address);
+    AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    return std::string{};
+}
+
+/// Pops the flow stack selected by the operation's meta: decrements the top index, loads the
+/// stored target into PC.x and issues CONT.
+std::string ARBDecompiler::PopFlowStack(Operation operation) {
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const std::string_view name = StackName(stack_class);
+    AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", name, name);
+    AddLine("CONT;");
+    return std::string{};
+}
+
+/// Operation handler that forwards to the parameterless Exit() overload; yields no expression.
+std::string ARBDecompiler::Exit(Operation) {
+    Exit();
+    return std::string{};
+}
+
+/// Emits an unconditional KIL instruction; yields no expression.
+std::string ARBDecompiler::Discard(Operation) {
+    AddLine("KIL TR;");
+    return std::string{};
+}
+
+/// Emits the EMIT instruction; yields no expression.
+std::string ARBDecompiler::EmitVertex(Operation) {
+    AddLine("EMIT;");
+    return std::string{};
+}
+
+/// Emits the ENDPRIM instruction; yields no expression.
+std::string ARBDecompiler::EndPrimitive(Operation) {
+    AddLine("ENDPRIM;");
+    return std::string{};
+}
+
+/// Returns the name of the built-in that holds the invocation id; emits no code.
+std::string ARBDecompiler::InvocationId(Operation) {
+    return std::string{"primitive.invocation"};
+}
+
+/// Stubbed: logs a warning and always yields a temporary loaded with the constant 1.
+std::string ARBDecompiler::YNegate(Operation) {
+    LOG_WARNING(Render_OpenGL, "(STUBBED)");
+    std::string result = AllocTemporary();
+    AddLine("MOV.F {}, 1;", result);
+    return result;
+}
+
+/// Returns the thread id attribute of the current stage's input; emits no code.
+std::string ARBDecompiler::ThreadId(Operation) {
+    const auto input_name = StageInputName(stage);
+    return fmt::format("{}.threadid", input_name);
+}
+
+/// Emits an indexed shuffle (SHFIDX) of the first operand using the second as the index, and
+/// returns the x component loaded from the instruction's y result. When the device lacks warp
+/// intrinsics an error is logged and the first operand is returned untouched.
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+    if (device.HasWarpIntrinsics()) {
+        const std::string shuffled = AllocVectorTemporary();
+        AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", shuffled, Visit(operation[0]),
+                Visit(operation[1]));
+        AddLine("MOV.U {}.x, {}.y;", shuffled, shuffled);
+        return fmt::format("{}.x", shuffled);
+    }
+    // Fallback: without the extension the value is passed through unshuffled.
+    LOG_ERROR(Render_OpenGL,
+              "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+    return Visit(operation[0]);
+}
+
+/// Emits the BAR instruction; yields no expression.
+std::string ARBDecompiler::Barrier(Operation) {
+    AddLine("BAR;");
+    return std::string{};
+}
+
+/// Emits the MEMBAR.CTA instruction; yields no expression.
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) {
+    AddLine("MEMBAR.CTA;");
+    return std::string{};
+}
+
+/// Emits the MEMBAR instruction; yields no expression.
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) {
+    AddLine("MEMBAR;");
+    return std::string{};
+}
+
+} // Anonymous namespace
+
+/// Public entry point: runs an ARBDecompiler over the given IR and returns the code it produced.
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier) {
+    ARBDecompiler decompiler(device, ir, registry, stage, identifier);
+    return decompiler.Code();
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+/// Decompiles the shader IR `ir` for the given `stage` and returns the generated code as a
+/// string.
+/// NOTE(review): how `registry` and `identifier` influence the output is not visible from this
+/// declaration - confirm against the definition in gl_arb_decompiler.cpp.
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+ const VideoCommon::Shader::Registry& registry,
+ Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 4eb37a96c..b1c4cd62f 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@
#include "common/assert.h"
#include "common/microprofile.h"
+#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -21,22 +22,54 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
: VideoCommon::BufferBlock{cpu_addr, size} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+ if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
+ glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+ glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+ }
}
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
+
+/// Writes `size` bytes from `data` into this buffer, starting at byte `offset`.
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
+    const auto byte_offset = static_cast<GLintptr>(offset);
+    const auto byte_count = static_cast<GLsizeiptr>(size);
+    glNamedBufferSubData(Handle(), byte_offset, byte_count, data);
+}
-OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
- const Device& device, std::size_t stream_size)
- : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+/// Reads `size` bytes starting at byte `offset` back into `data`. The range is first copied
+/// into a separate read buffer and then fetched from there.
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    // The read buffer is created on first use and sized like the whole block, so the same
+    // offset is valid in both buffers.
+    if (read_buffer.handle == 0) {
+        read_buffer.Create();
+        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
+                          GL_STREAM_READ);
+    }
+    const auto byte_offset = static_cast<GLintptr>(offset);
+    const auto byte_count = static_cast<GLsizeiptr>(size);
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, byte_offset, byte_offset,
+                             byte_count);
+    glGetNamedBufferSubData(read_buffer.handle, byte_offset, byte_count, data);
+}
+
+/// Copies `size` bytes from `src` (starting at `src_offset`) into this buffer (starting at
+/// `dst_offset`).
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) {
+    const auto read_offset = static_cast<GLintptr>(src_offset);
+    const auto write_offset = static_cast<GLintptr>(dst_offset);
+    const auto byte_count = static_cast<GLsizeiptr>(size);
+    glCopyNamedBufferSubData(src.Handle(), Handle(), read_offset, write_offset, byte_count);
+}
+
+OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
+ Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
+ const Device& device_, std::size_t stream_size)
+ : GenericBufferCache{rasterizer, gpu_memory, cpu_memory,
+ std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+ device{device_} {
if (!device.HasFastBufferSubData()) {
return;
}
- static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+ static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
for (const GLuint cbuf : cbufs) {
glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -47,49 +80,21 @@ OGLBufferCache::~OGLBufferCache() {
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
}
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
- return std::make_shared<CachedBufferBlock>(cpu_addr, size);
-}
-
-void OGLBufferCache::WriteBarrier() {
- glMemoryBarrier(GL_ALL_BARRIER_BITS);
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+ return std::make_shared<Buffer>(device, cpu_addr, size);
}
-const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
- return buffer->GetHandle();
-}
-
-const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
- static const GLuint null_buffer = 0;
- return &null_buffer;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) {
- glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) {
- MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
- glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) {
- glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
- static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
- static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+ return {0, 0, 0};
}
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
std::size_t size) {
DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
- const GLuint& cbuf = cbufs[cbuf_cursor++];
+ const GLuint cbuf = cbufs[cbuf_cursor++];
+
glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
- return {&cbuf, 0};
+ return {cbuf, 0, 0};
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index d94a11252..f75b32e31 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -24,59 +23,59 @@ class Device;
class OGLStreamBuffer;
class RasterizerOpenGL;
-class CachedBufferBlock;
+class Buffer : public VideoCommon::BufferBlock {
+public:
+ explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+ ~Buffer();
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+ void Upload(std::size_t offset, std::size_t size, const u8* data);
-class CachedBufferBlock : public VideoCommon::BufferBlock {
-public:
- explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
- ~CachedBufferBlock();
+ void Download(std::size_t offset, std::size_t size, u8* data);
+
+ void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+ std::size_t size);
- const GLuint* GetHandle() const {
- return &gl_buffer.handle;
+ GLuint Handle() const noexcept {
+ return gl_buffer.handle;
+ }
+
+ u64 Address() const noexcept {
+ return gpu_address;
}
private:
- OGLBuffer gl_buffer{};
+ OGLBuffer gl_buffer;
+ OGLBuffer read_buffer;
+ u64 gpu_address = 0;
};
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
class OGLBufferCache final : public GenericBufferCache {
public:
- explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
+ explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
+ Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device, std::size_t stream_size);
~OGLBufferCache();
- const GLuint* GetEmptyBuffer(std::size_t) override;
+ BufferInfo GetEmptyBuffer(std::size_t) override;
void Acquire() noexcept {
cbuf_cursor = 0;
}
protected:
- Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
- void WriteBarrier() override;
-
- const GLuint* ToHandle(const Buffer& buffer) override;
-
- void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) override;
-
- void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) override;
-
- void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) override;
+ std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
private:
+ static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+ Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+ const Device& device;
+
std::size_t cbuf_cursor = 0;
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
- Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
- cbufs;
+ std::array<GLuint, NUM_CBUFS> cbufs{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index c286502ba..a94e4f72e 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
#include <array>
#include <cstddef>
#include <cstring>
+#include <limits>
#include <optional>
#include <vector>
@@ -13,6 +14,7 @@
#include "common/logging/log.h"
#include "common/scope_exit.h"
+#include "core/settings.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
- GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
- GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+ GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+ GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+ GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
constexpr std::array LimitSSBOs = {
- GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+ GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
- GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+ GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
-constexpr std::array LimitSamplers = {
- GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
- GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
- GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+ GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TEXTURE_IMAGE_UNITS,
+ GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
- GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
- GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
- GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+ GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+ GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+ GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
template <typename T>
T GetInteger(GLenum pname) {
@@ -84,10 +89,17 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
return std::exchange(base, base + amount);
}
+/// Queries the limit named by each LimitUBOs entry and returns the values in the same order.
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> limits;
+    std::size_t slot = 0;
+    for (const GLenum pname : LimitUBOs) {
+        limits[slot] = GetInteger<u32>(pname);
+        ++slot;
+    }
+    return limits;
+}
+
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
- static std::array<std::size_t, 5> stage_swizzle = {0, 1, 2, 3, 4};
+ static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4};
const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS);
const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS);
const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS);
@@ -111,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
u32 base_images = 0;
- // Reserve more image bindings on fragment and vertex stages.
+ // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+ // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+ // fragment stage, and at least 1 for the rest of the stages.
+ // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+ // Reserve at least 4 image bindings on the fragment stage.
bindings[4].image =
- Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
- bindings[0].image =
- Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+ Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+ // This is guaranteed to be at least 1.
+ const u32 total_extracted_images = num_images / (NumStages - 1);
// Reserve the other image bindings.
- const u32 total_extracted_images = num_images / (NumStages - 2);
- for (std::size_t i = 2; i < NumStages; ++i) {
+ for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
+ if (stage == 4) {
+ continue;
+ }
bindings[stage].image =
Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
}
@@ -132,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
}
bool IsASTCSupported() {
+ static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
static constexpr std::array formats = {
GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -148,59 +169,94 @@ bool IsASTCSupported() {
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
};
- return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
- GLint supported;
- glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
- &supported);
- return supported == GL_TRUE;
- }) == formats.end();
+ static constexpr std::array required_support = {
+ GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+ GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE,
+ };
+
+ for (const GLenum target : targets) {
+ for (const GLenum format : formats) {
+ for (const GLenum support : required_support) {
+ GLint value;
+ glGetInternalformativ(target, format, support, 1, &value);
+ if (value != GL_FULL_SUPPORT) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
}
} // Anonymous namespace
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+ : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
- const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+ const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
- const bool is_intel = vendor == "Intel";
- const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+
+ bool disable_fast_buffer_sub_data = false;
+ if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+ LOG_WARNING(
+ Render_OpenGL,
+ "Beta driver 443.24 is known to have issues. There might be performance issues.");
+ disable_fast_buffer_sub_data = true;
+ }
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+ max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
GLAD_GL_NV_shader_thread_shuffle;
has_shader_ballot = GLAD_GL_ARB_shader_ballot;
has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+ has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
has_astc = IsASTCSupported();
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
- has_broken_compute = is_intel_proprietary;
- has_fast_buffer_sub_data = is_nvidia;
+ has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+ has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+ // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+ // uniform buffers as "push constants"
+ has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+ use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
+ GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
+ GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
+
+ use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
+
+ if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
+ LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
+ }
}
Device::Device(std::nullptr_t) {
- uniform_buffer_alignment = 0;
+ max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+ uniform_buffer_alignment = 4;
+ shader_storage_alignment = 4;
max_vertex_attributes = 16;
max_varyings = 15;
+ max_compute_shared_memory_size = 0x10000;
has_warp_intrinsics = true;
has_shader_ballot = true;
has_vertex_viewport_layer = true;
has_image_load_formatted = true;
+ has_texture_shadow_lod = true;
has_variable_aoffi = true;
- has_component_indexing_bug = false;
- has_broken_compute = false;
- has_precise_bug = false;
}
bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index a55050cb5..8a4b6b9fc 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
explicit Device();
explicit Device(std::nullptr_t);
+ u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+ return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+ }
+
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
return base_bindings[stage_index];
}
@@ -48,6 +52,10 @@ public:
return max_varyings;
}
+ u32 GetMaxComputeSharedMemorySize() const {
+ return max_compute_shared_memory_size;
+ }
+
bool HasWarpIntrinsics() const {
return has_warp_intrinsics;
}
@@ -64,6 +72,14 @@ public:
return has_image_load_formatted;
}
+ bool HasTextureShadowLod() const {
+ return has_texture_shadow_lod;
+ }
+
+ bool HasVertexBufferUnifiedMemory() const {
+ return has_vertex_buffer_unified_memory;
+ }
+
bool HasASTC() const {
return has_astc;
}
@@ -80,33 +96,47 @@ public:
return has_precise_bug;
}
- bool HasBrokenCompute() const {
- return has_broken_compute;
- }
-
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
+ bool HasNvViewportArray2() const {
+ return has_nv_viewport_array2;
+ }
+
+ bool UseAssemblyShaders() const {
+ return use_assembly_shaders;
+ }
+
+ bool UseAsynchronousShaders() const {
+ return use_asynchronous_shaders;
+ }
+
private:
static bool TestVariableAoffi();
static bool TestPreciseBug();
- std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+ std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+ std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
u32 max_varyings{};
+ u32 max_compute_shared_memory_size{};
bool has_warp_intrinsics{};
bool has_shader_ballot{};
bool has_vertex_viewport_layer{};
bool has_image_load_formatted{};
+ bool has_texture_shadow_lod{};
+ bool has_vertex_buffer_unified_memory{};
bool has_astc{};
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
- bool has_broken_compute{};
bool has_fast_buffer_sub_data{};
+ bool has_nv_viewport_array2{};
+ bool use_assembly_shaders{};
+ bool use_asynchronous_shaders{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
new file mode 100644
index 000000000..b532fdcc2
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -0,0 +1,73 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+
+#include <glad/glad.h>
+
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_fence_manager.h"
+
+namespace OpenGL {
+
+/// Builds a fence from a payload only; both arguments are forwarded to FenceBase.
+GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {}
+
+/// Builds a fence from an address and a payload; all arguments are forwarded to FenceBase.
+GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed)
+ : FenceBase(address, payload, is_stubbed) {}
+
+/// Destructor defaulted out of line.
+GLInnerFence::~GLInnerFence() = default;
+
+/// Creates the sync object backing this fence. Stubbed fences never get one.
+void GLInnerFence::Queue() {
+    if (!is_stubbed) {
+        // The handle must still be empty when the fence is queued.
+        ASSERT(sync_object.handle == 0);
+        sync_object.Create();
+    }
+}
+
+/// Reports whether the fence has been reached.
+/// @return true for stubbed fences; otherwise true only when the sync object's
+///         GL_SYNC_STATUS is GL_SIGNALED.
+bool GLInnerFence::IsSignaled() const {
+    if (is_stubbed) {
+        return true;
+    }
+    ASSERT(sync_object.handle != 0);
+    // Start from GL_UNSIGNALED so a failed query can never read an indeterminate value.
+    GLint sync_status = GL_UNSIGNALED;
+    // bufSize counts integers, not bytes, and GL_SYNC_STATUS yields exactly one value.
+    // The written length is not needed, so nullptr is passed for it.
+    glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status);
+    return sync_status == GL_SIGNALED;
+}
+
+/// Blocks on the sync object with no timeout. Stubbed fences return immediately.
+void GLInnerFence::Wait() {
+    if (!is_stubbed) {
+        ASSERT(sync_object.handle != 0);
+        // NOTE(review): flags are 0, so GL_SYNC_FLUSH_COMMANDS_BIT is not requested;
+        // presumably the fence is flushed elsewhere before this wait - confirm.
+        glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED);
+    }
+}
+
+/// Forwards every dependency unchanged to the GenericFenceManager base; no other state is set.
+FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+ TextureCacheOpenGL& texture_cache,
+ OGLBufferCache& buffer_cache, QueryCache& query_cache)
+ : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {}
+
+/// Allocates a shared GLInnerFence from a payload value and the stubbed flag.
+Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
+ return std::make_shared<GLInnerFence>(value, is_stubbed);
+}
+
+/// Allocates a shared GLInnerFence from an address, a payload value and the stubbed flag.
+Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) {
+ return std::make_shared<GLInnerFence>(addr, value, is_stubbed);
+}
+
+/// Delegates to GLInnerFence::Queue on the given fence.
+void FenceManagerOpenGL::QueueFence(Fence& fence) {
+ fence->Queue();
+}
+
+/// Delegates to GLInnerFence::IsSignaled on the given fence.
+bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const {
+ return fence->IsSignaled();
+}
+
+/// Delegates to GLInnerFence::Wait on the given fence.
+void FenceManagerOpenGL::WaitFence(Fence& fence) {
+ fence->Wait();
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h
new file mode 100644
index 000000000..da1dcdace
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_fence_manager.h
@@ -0,0 +1,52 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+#include "common/common_types.h"
+#include "video_core/fence_manager.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_texture_cache.h"
+
+namespace OpenGL {
+
+/// Fence type derived from VideoCommon::FenceBase that owns an OGLSync object.
+class GLInnerFence : public VideoCommon::FenceBase {
+public:
+ GLInnerFence(u32 payload, bool is_stubbed);
+ GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed);
+ ~GLInnerFence();
+
+ /// Queues the fence.
+ void Queue();
+
+ /// Queries whether the fence has been signaled.
+ bool IsSignaled() const;
+
+ /// Waits on the fence.
+ void Wait();
+
+private:
+ OGLSync sync_object; ///< Sync object owned by this fence
+};
+
+using Fence = std::shared_ptr<GLInnerFence>;
+using GenericFenceManager =
+ VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>;
+
+/// OpenGL specialization of the generic fence manager, using shared GLInnerFence objects as
+/// the fence type. The protected members override the hooks declared by the base class.
+class FenceManagerOpenGL final : public GenericFenceManager {
+public:
+ explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+ TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache,
+ QueryCache& query_cache);
+
+protected:
+ Fence CreateFence(u32 value, bool is_stubbed) override;
+ Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override;
+ void QueueFence(Fence& fence) override;
+ bool IsFenceSignaled(Fence& fence) const override;
+ void WaitFence(Fence& fence) override;
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp
index f12e9f55f..1a3d9720e 100644
--- a/src/video_core/renderer_opengl/gl_query_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_query_cache.cpp
@@ -30,12 +30,11 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) {
} // Anonymous namespace
-QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer)
- : VideoCommon::QueryCacheBase<
- QueryCache, CachedQuery, CounterStream, HostCounter,
- std::vector<OGLQuery>>{system,
- static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)},
- gl_rasterizer{gl_rasterizer} {}
+/// Passes the rasterizer, the 3D engine and the GPU memory manager to QueryCacheBase and keeps
+/// a reference to the rasterizer in gl_rasterizer.
+QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
+ Tegra::MemoryManager& gpu_memory)
+ : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>(
+ rasterizer, maxwell3d, gpu_memory),
+ gl_rasterizer{rasterizer} {}
QueryCache::~QueryCache() = default;
@@ -90,13 +89,15 @@ u64 HostCounter::BlockingQuery() const {
CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
: VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
+/// Destructor defaulted out of line.
+CachedQuery::~CachedQuery() = default;
+
CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
: VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
+/// Move assignment: copies this class's own members from rhs, then move-assigns the base.
CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
- VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs));
+ // cache and type are read from rhs before the base sub-object is moved out of it.
cache = rhs.cache;
type = rhs.type;
+ CachedQueryBase<HostCounter>::operator=(std::move(rhs));
return *this;
}
diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h
index d8e7052a1..82cac51ee 100644
--- a/src/video_core/renderer_opengl/gl_query_cache.h
+++ b/src/video_core/renderer_opengl/gl_query_cache.h
@@ -26,10 +26,11 @@ class RasterizerOpenGL;
using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
-class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream,
- HostCounter, std::vector<OGLQuery>> {
+class QueryCache final
+ : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> {
public:
- explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
+ explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
+ Tegra::MemoryManager& gpu_memory);
~QueryCache();
OGLQuery AllocateQuery(VideoCore::QueryType type);
@@ -40,6 +41,7 @@ public:
private:
RasterizerOpenGL& gl_rasterizer;
+ std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools;
};
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
@@ -62,10 +64,12 @@ class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
public:
explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
u8* host_ptr);
- CachedQuery(CachedQuery&& rhs) noexcept;
- CachedQuery(const CachedQuery&) = delete;
+ ~CachedQuery() override;
+ CachedQuery(CachedQuery&& rhs) noexcept;
CachedQuery& operator=(CachedQuery&& rhs) noexcept;
+
+ CachedQuery(const CachedQuery&) = delete;
CachedQuery& operator=(const CachedQuery&) = delete;
void Flush() override;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index f4598fbf7..cfddbde5d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"
namespace OpenGL {
@@ -54,19 +55,36 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
namespace {
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+// Const buffer size bookkeeping: 18 const buffers per stage, each sized
+// Maxwell::MaxConstBufferSize, multiplied by Maxwell::MaxShaderStage stages for the total.
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+ NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+ NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
ShaderType shader_type, std::size_t index = 0) {
- if (entry.IsBindless()) {
- const Tegra::Texture::TextureHandle tex_handle =
- engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset());
- return engine.GetTextureInfo(tex_handle);
+ if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+ if (entry.is_separated) {
+ const u32 buffer_1 = entry.buffer;
+ const u32 buffer_2 = entry.secondary_buffer;
+ const u32 offset_1 = entry.offset;
+ const u32 offset_2 = entry.secondary_offset;
+ const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+ const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+ return engine.GetTextureInfo(handle_1 | handle_2);
+ }
+ }
+ if (entry.is_bindless) {
+ const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+ return engine.GetTextureInfo(handle);
}
+
const auto& gpu_profile = engine.AccessGuestDriverProfile();
- const u32 offset =
- entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
+ const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
return engine.GetStageTexture(shader_type, offset);
} else {
@@ -89,23 +107,84 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
return buffer.size;
}
+/// Maps a hardware transform feedback location to its OpenGL attribute token and index.
+/// @param location Hardware location; it is divided by four to obtain the attribute index
+/// @return Pair holding the first and third arguments of an ARB_transform_feedback3 token
+///         stream entry
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const int attribute = location / 4;
+    if (attribute == 7) {
+        return {GL_POSITION, 0};
+    }
+    // Indices 8 to 39 are generic attributes, numbered from zero.
+    if (attribute >= 8 && attribute < 40) {
+        return {GL_GENERIC_ATTRIB_NV, attribute - 8};
+    }
+    if (attribute == 40) {
+        return {GL_PRIMARY_COLOR_NV, 0};
+    }
+    if (attribute == 41) {
+        return {GL_SECONDARY_COLOR_NV, 0};
+    }
+    if (attribute == 42) {
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    }
+    if (attribute == 43) {
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    // Indices 48 to 55 are texture coordinates, numbered from zero.
+    if (attribute >= 48 && attribute < 56) {
+        return {GL_TEXTURE_COORD_NV, attribute - 48};
+    }
+    // Anything else is reported and falls back to the position token.
+    UNIMPLEMENTED_MSG("index={}", attribute);
+    return {GL_POSITION, 0};
+}
+
void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
+/// Uploads the given bindless SSBO descriptors as program local parameters of `target`,
+/// starting at parameter index 0. Nothing is uploaded when the list is empty.
+/// NOTE(review): assumes each BindlessSSBO has the layout of one 4 x GLuint parameter - confirm.
+void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
+    if (num_ssbos != 0) {
+        const auto count = static_cast<GLsizei>(num_ssbos);
+        const auto* const params = reinterpret_cast<const GLuint*>(ssbos);
+        glProgramLocalParametersI4uivNV(target, 0, count, params);
+    }
+}
+
} // Anonymous namespace
-RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
- ScreenInfo& info, GLShader::ProgramManager& program_manager,
- StateTracker& state_tracker)
- : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
- shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
- screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker},
- buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
+/// Stores references to the GPU, its engines and memory manager, builds the caches and the
+/// fence manager, then allocates the GL buffers used for const buffers.
+/// NOTE(review): several initializers read earlier members (gpu, maxwell3d, device, the caches);
+/// this relies on the member declaration order in the header - confirm before reordering.
+RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_,
+ Core::Memory::Memory& cpu_memory, const Device& device_,
+ ScreenInfo& screen_info_, ProgramManager& program_manager_,
+ StateTracker& state_tracker_)
+ : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
+ kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
+ screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
+ texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker),
+ shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device),
+ query_cache(*this, maxwell3d, gpu_memory),
+ buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE),
+ fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
+ async_shaders(emu_window) {
CheckExtensions();
+
+ // One immutable buffer sized for every const buffer of every stage, created with no flags.
+ unified_uniform_buffer.Create();
+ glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
+ // Assembly shaders additionally get staging buffers of Maxwell::MaxConstBufferSize each.
+ if (device.UseAssemblyShaders()) {
+ glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+ for (const GLuint cbuf : staging_cbufs) {
+ glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
+ nullptr, 0);
+ }
+ }
+
+ if (device.UseAsynchronousShaders()) {
+ async_shaders.AllocateWorkers();
+ }
}
-RasterizerOpenGL::~RasterizerOpenGL() {}
+/// Deletes the staging const buffers, which the constructor only creates for assembly shaders.
+RasterizerOpenGL::~RasterizerOpenGL() {
+ if (device.UseAssemblyShaders()) {
+ glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+ }
+}
void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -116,8 +195,7 @@ void RasterizerOpenGL::CheckExtensions() {
}
void RasterizerOpenGL::SetupVertexFormat() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexFormats]) {
return;
}
@@ -131,13 +209,13 @@ void RasterizerOpenGL::SetupVertexFormat() {
// avoid OpenGL errors.
// TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
// assume every shader uses them all.
- for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
if (!flags[Dirty::VertexFormat0 + index]) {
continue;
}
flags[Dirty::VertexFormat0 + index] = false;
- const auto attrib = gpu.regs.vertex_attrib_format[index];
+ const auto attrib = maxwell3d.regs.vertex_attrib_format[index];
const auto gl_index = static_cast<GLuint>(index);
// Disable constant attributes.
@@ -150,9 +228,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
- MaxwellToGL::VertexType(attrib), attrib.offset);
+ MaxwellToGL::VertexFormat(attrib), attrib.offset);
} else {
- glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+ glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+ MaxwellToGL::VertexFormat(attrib),
attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
}
glVertexAttribBinding(gl_index, attrib.buffer);
@@ -160,8 +239,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
}
void RasterizerOpenGL::SetupVertexBuffer() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexBuffers]) {
return;
}
@@ -169,9 +247,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
MICROPROFILE_SCOPE(OpenGL_VB);
+ const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
// Upload all guest vertex arrays sequentially to our buffer
- const auto& regs = gpu.regs;
- for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+ const auto& regs = maxwell3d.regs;
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
if (!flags[Dirty::VertexBuffer0 + index]) {
continue;
}
@@ -184,27 +264,37 @@ void RasterizerOpenGL::SetupVertexBuffer() {
const GPUVAddr start = vertex_array.StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
- ASSERT(end > start);
- const u64 size = end - start + 1;
- const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-
- // Bind the vertex array to the buffer at the current offset.
- vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer,
- vertex_buffer_offset, vertex_array.stride);
+ ASSERT(end >= start);
+
+ const GLuint gl_index = static_cast<GLuint>(index);
+ const u64 size = end - start;
+ if (size == 0) {
+ glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+ if (use_unified_memory) {
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+ }
+ continue;
+ }
+ const auto info = buffer_cache.UploadMemory(start, size);
+ if (use_unified_memory) {
+ glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+ info.address + info.offset, size);
+ } else {
+ glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+ }
}
}
void RasterizerOpenGL::SetupVertexInstances() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexInstances]) {
return;
}
flags[Dirty::VertexInstances] = false;
- const auto& regs = gpu.regs;
- for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+ const auto& regs = maxwell3d.regs;
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
if (!flags[Dirty::VertexInstance0 + index]) {
continue;
}
@@ -219,24 +309,23 @@ void RasterizerOpenGL::SetupVertexInstances() {
+/// Uploads the index array through the buffer cache, binds the resulting buffer as the element
+/// array buffer and returns the offset reported by the upload.
GLintptr RasterizerOpenGL::SetupIndexBuffer() {
MICROPROFILE_SCOPE(OpenGL_Index);
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& regs = maxwell3d.regs;
const std::size_t size = CalculateIndexBufferSize();
- const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
- vertex_array_pushbuffer.SetIndexBuffer(buffer);
- return offset;
+ const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+ glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+ return info.offset;
}
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
MICROPROFILE_SCOPE(OpenGL_Shader);
- auto& gpu = system.GPU().Maxwell3D();
u32 clip_distances = 0;
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
- const auto& shader_config = gpu.regs.shader_config[index];
+ const auto& shader_config = maxwell3d.regs.shader_config[index];
const auto program{static_cast<Maxwell::ShaderProgram>(index)};
// Skip stages that are not enabled
- if (!gpu.regs.IsShaderConfigEnabled(index)) {
+ if (!maxwell3d.regs.IsShaderConfigEnabled(index)) {
switch (program) {
case Maxwell::ShaderProgram::Geometry:
program_manager.UseGeometryShader(0);
@@ -251,23 +340,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
// Currently this stages are not supported in the OpenGL backend.
- // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
- if (program == Maxwell::ShaderProgram::TesselationControl) {
- continue;
- } else if (program == Maxwell::ShaderProgram::TesselationEval) {
+ // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
+ if (program == Maxwell::ShaderProgram::TesselationControl ||
+ program == Maxwell::ShaderProgram::TesselationEval) {
continue;
}
- Shader shader{shader_cache.GetStageProgram(program)};
+ Shader* const shader = shader_cache.GetStageProgram(program, async_shaders);
- // Stage indices are 0 - 5
- const std::size_t stage = index == 0 ? 0 : index - 1;
- SetupDrawConstBuffers(stage, shader);
- SetupDrawGlobalMemory(stage, shader);
- SetupDrawTextures(stage, shader);
- SetupDrawImages(stage, shader);
-
- const GLuint program_handle = shader->GetHandle();
+ const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0;
switch (program) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
@@ -284,6 +365,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
shader_config.enable.Value(), shader_config.offset);
}
+ // Stage indices are 0 - 5
+ const std::size_t stage = index == 0 ? 0 : index - 1;
+ SetupDrawConstBuffers(stage, shader);
+ SetupDrawGlobalMemory(stage, shader);
+ SetupDrawTextures(stage, shader);
+ SetupDrawImages(stage, shader);
+
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
// (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -298,11 +386,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
SyncClipEnabled(clip_distances);
- gpu.dirty.flags[Dirty::Shaders] = false;
+ maxwell3d.dirty.flags[Dirty::Shaders] = false;
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& regs = maxwell3d.regs;
std::size_t size = 0;
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
@@ -312,49 +400,42 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
const GPUVAddr start = regs.vertex_array[index].StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
- ASSERT(end > start);
- size += end - start + 1;
+ size += end - start;
+ ASSERT(end >= start);
}
return size;
}
+/// Returns the index count multiplied by the per-index format size in bytes, both taken from
+/// the index_array registers.
std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
- const auto& regs = system.GPU().Maxwell3D().regs;
-
- return static_cast<std::size_t>(regs.index_array.count) *
- static_cast<std::size_t>(regs.index_array.FormatSizeInBytes());
+ return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
+ static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
}
-void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading,
+/// Forwards the title id, stop flag and progress callback to the shader cache's disk loader.
+void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) {
- shader_cache.LoadDiskCache(stop_loading, callback);
-}
-
-void RasterizerOpenGL::SetupDirtyFlags() {
- state_tracker.Initialize();
+ shader_cache.LoadDiskCache(title_id, stop_loading, callback);
}
void RasterizerOpenGL::ConfigureFramebuffers() {
MICROPROFILE_SCOPE(OpenGL_Framebuffer);
- auto& gpu = system.GPU().Maxwell3D();
- if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) {
+ if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) {
return;
}
- gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false;
+ maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false;
texture_cache.GuardRenderTargets(true);
- View depth_surface = texture_cache.GetDepthBufferSurface();
+ View depth_surface = texture_cache.GetDepthBufferSurface(true);
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
// Bind the framebuffer surfaces
FramebufferCacheKey key;
const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
for (std::size_t index = 0; index < colors_count; ++index) {
- View color_surface{texture_cache.GetColorBufferSurface(index)};
+ View color_surface{texture_cache.GetColorBufferSurface(index, true)};
if (!color_surface) {
continue;
}
@@ -378,40 +459,62 @@ void RasterizerOpenGL::ConfigureFramebuffers() {
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
-void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb,
- bool using_stencil_fb) {
- auto& gpu = system.GPU().Maxwell3D();
- const auto& regs = gpu.regs;
+/// Binds the draw framebuffer used by a clear operation.
+/// @param using_color Fetch the color target selected by regs.clear_buffers.RT
+/// @param using_depth_stencil Fetch the depth buffer
+/// Each surface is requested with preserve_contents set when the clear cannot overwrite the
+/// whole target: a disabled color channel mask (color only), or an enabled scissor whose
+/// rectangle does not cover the full render target.
+void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) {
+ const auto& regs = maxwell3d.regs;
texture_cache.GuardRenderTargets(true);
View color_surface;
- if (using_color_fb) {
+
+ if (using_color) {
+ // Determine if we have to preserve the contents.
+ // First we have to make sure all clear masks are enabled.
+ bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G ||
+ !regs.clear_buffers.B || !regs.clear_buffers.A;
const std::size_t index = regs.clear_buffers.RT;
- color_surface = texture_cache.GetColorBufferSurface(index);
+ if (regs.clear_flags.scissor) {
+ // Then we have to confirm scissor testing clears the whole image.
+ const auto& scissor = regs.scissor_test[0];
+ preserve_contents |= scissor.min_x > 0;
+ preserve_contents |= scissor.min_y > 0;
+ preserve_contents |= scissor.max_x < regs.rt[index].width;
+ preserve_contents |= scissor.max_y < regs.rt[index].height;
+ }
+
+ color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents);
texture_cache.MarkColorBufferInUse(index);
}
+
View depth_surface;
- if (using_depth_fb || using_stencil_fb) {
- depth_surface = texture_cache.GetDepthBufferSurface();
+ if (using_depth_stencil) {
+ bool preserve_contents = false;
+ if (regs.clear_flags.scissor) {
+ // For depth stencil clears we only have to confirm scissor test covers the whole image.
+ const auto& scissor = regs.scissor_test[0];
+ preserve_contents |= scissor.min_x > 0;
+ preserve_contents |= scissor.min_y > 0;
+ preserve_contents |= scissor.max_x < regs.zeta_width;
+ preserve_contents |= scissor.max_y < regs.zeta_height;
+ }
+
+ depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents);
texture_cache.MarkDepthBufferInUse();
}
texture_cache.GuardRenderTargets(false);
FramebufferCacheKey key;
- key.colors[0] = color_surface;
- key.zeta = depth_surface;
+ key.colors[0] = std::move(color_surface);
+ key.zeta = std::move(depth_surface);
state_tracker.NotifyFramebuffer();
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
void RasterizerOpenGL::Clear() {
- const auto& gpu = system.GPU().Maxwell3D();
- if (!gpu.ShouldExecute()) {
+ if (!maxwell3d.ShouldExecute()) {
return;
}
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
bool use_color{};
bool use_depth{};
bool use_stencil{};
@@ -419,8 +522,7 @@ void RasterizerOpenGL::Clear() {
if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
regs.clear_buffers.A) {
use_color = true;
- }
- if (use_color) {
+
state_tracker.NotifyColorMask0();
glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0,
regs.clear_buffers.B != 0, regs.clear_buffers.A != 0);
@@ -458,7 +560,7 @@ void RasterizerOpenGL::Clear() {
UNIMPLEMENTED_IF(regs.clear_flags.viewport);
- ConfigureClearFramebuffer(use_color, use_depth, use_stencil);
+ ConfigureClearFramebuffer(use_color, use_depth || use_stencil);
if (use_color) {
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
@@ -477,7 +579,6 @@ void RasterizerOpenGL::Clear() {
void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
MICROPROFILE_SCOPE(OpenGL_Drawing);
- auto& gpu = system.GPU().Maxwell3D();
query_cache.UpdateCounters();
@@ -502,6 +603,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
SyncFramebufferSRGB();
buffer_cache.Acquire();
+ current_cbuf = 0;
std::size_t buffer_size = CalculateVertexArraysSize();
@@ -511,20 +613,28 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
// Uniform space for the 5 shader stages
- buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
- (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
- Maxwell::MaxShaderStage;
+ buffer_size =
+ Common::AlignUp<std::size_t>(buffer_size, 4) +
+ (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers
buffer_size += Maxwell::MaxConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
- buffer_cache.Map(buffer_size);
+ const bool invalidated = buffer_cache.Map(buffer_size);
+
+ if (invalidated) {
+ // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+ auto& dirty = maxwell3d.dirty.flags;
+ dirty[Dirty::VertexBuffers] = true;
+ for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+ dirty[index] = true;
+ }
+ }
// Prepare vertex array format.
SetupVertexFormat();
- vertex_array_pushbuffer.Setup();
// Upload vertex and index data.
SetupVertexBuffer();
@@ -534,21 +644,19 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
index_buffer_offset = SetupIndexBuffer();
}
- // Prepare packed bindings.
- bind_ubo_pushbuffer.Setup();
- bind_ssbo_pushbuffer.Setup();
-
// Setup emulation uniform buffer.
- GLShader::MaxwellUniformData ubo;
- ubo.SetFromRegs(gpu);
- const auto [buffer, offset] =
- buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
- bind_ubo_pushbuffer.Push(EmulationUniformBlockBinding, buffer, offset,
- static_cast<GLsizeiptr>(sizeof(ubo)));
+ if (!device.UseAssemblyShaders()) {
+ MaxwellUniformData ubo;
+ ubo.SetFromRegs(maxwell3d);
+ const auto info =
+ buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
+ glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
+ static_cast<GLsizeiptr>(sizeof(ubo)));
+ }
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
- const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
+ const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
SetupShaders(primitive_mode);
texture_cache.GuardSamplers(false);
@@ -557,11 +665,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
// Signal the buffer cache that we are not going to upload more things.
buffer_cache.Unmap();
- // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
- vertex_array_pushbuffer.Bind();
- bind_ubo_pushbuffer.Bind();
- bind_ssbo_pushbuffer.Bind();
-
program_manager.BindGraphicsPipeline();
if (texture_cache.TextureBarrier()) {
@@ -570,14 +673,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
BeginTransformFeedback(primitive_mode);
- const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
+ const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance);
const GLsizei num_instances =
- static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
+ static_cast<GLsizei>(is_instanced ? maxwell3d.mme_draw.instance_count : 1);
if (is_indexed) {
- const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
- const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
+ const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base);
+ const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count);
const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
- const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
+ const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format);
if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
glDrawElements(primitive_mode, num_vertices, format, offset);
} else if (num_instances == 1 && base_instance == 0) {
@@ -596,8 +699,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
base_instance);
}
} else {
- const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
- const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
+ const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first);
+ const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count);
if (num_instances == 1 && base_instance == 0) {
glDrawArrays(primitive_mode, base_vertex, num_vertices);
} else if (base_instance == 0) {
@@ -611,37 +714,32 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
EndTransformFeedback();
++num_queued_commands;
+
+ gpu.TickWork();
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
- if (device.HasBrokenCompute()) {
- return;
- }
-
buffer_cache.Acquire();
+ current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr);
+ program_manager.BindCompute(kernel->GetHandle());
+
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
- program_manager.BindComputeShader(kernel->GetHandle());
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);
- bind_ubo_pushbuffer.Setup();
- bind_ssbo_pushbuffer.Setup();
-
SetupComputeConstBuffers(kernel);
SetupComputeGlobalMemory(kernel);
buffer_cache.Unmap();
- bind_ubo_pushbuffer.Bind();
- bind_ssbo_pushbuffer.Bind();
-
- const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+ const auto& launch_desc = kepler_compute.launch_description;
+ program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
@@ -667,6 +765,13 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
query_cache.FlushRegion(addr, size);
}
+bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
+ if (!Settings::IsGPULevelHigh()) {
+ return buffer_cache.MustFlushRegion(addr, size);
+ }
+ return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size);
+}
+
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
if (addr == 0 || size == 0) {
@@ -678,13 +783,64 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
query_cache.InvalidateRegion(addr, size);
}
+void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
+ MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+ if (addr == 0 || size == 0) {
+ return;
+ }
+ texture_cache.OnCPUWrite(addr, size);
+ shader_cache.OnCPUWrite(addr, size);
+ buffer_cache.OnCPUWrite(addr, size);
+}
+
+void RasterizerOpenGL::SyncGuestHost() {
+ MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+ texture_cache.SyncGuestHost();
+ buffer_cache.SyncGuestHost();
+ shader_cache.SyncGuestHost();
+}
+
+void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
+ if (!gpu.IsAsync()) {
+ gpu_memory.Write<u32>(addr, value);
+ return;
+ }
+ fence_manager.SignalSemaphore(addr, value);
+}
+
+void RasterizerOpenGL::SignalSyncPoint(u32 value) {
+ if (!gpu.IsAsync()) {
+ gpu.IncrementSyncPoint(value);
+ return;
+ }
+ fence_manager.SignalSyncPoint(value);
+}
+
+void RasterizerOpenGL::ReleaseFences() {
+ if (!gpu.IsAsync()) {
+ return;
+ }
+ fence_manager.WaitPendingFences();
+}
+
void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
- if (Settings::values.use_accurate_gpu_emulation) {
+ if (Settings::IsGPULevelExtreme()) {
FlushRegion(addr, size);
}
InvalidateRegion(addr, size);
}
+void RasterizerOpenGL::WaitForIdle() {
+ // Place a barrier on everything that is not framebuffer related.
+ // This is related to another flag that is not currently implemented.
+ glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
+ GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
+ GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
+ GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
+ GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
+ GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
+}
+
void RasterizerOpenGL::FlushCommands() {
// Only flush when we have commands queued to OpenGL.
if (num_queued_commands == 0) {
@@ -739,40 +895,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
return true;
}
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
+ static constexpr std::array PARAMETER_LUT{
+ GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+ GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+ GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
+ };
MICROPROFILE_SCOPE(OpenGL_UBO);
- const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
+ const auto& stages = maxwell3d.state.shader_stages;
const auto& shader_stage = stages[stage_index];
-
- u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
- for (const auto& entry : shader->GetEntries().const_buffers) {
- const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
- SetupConstBuffer(binding++, buffer, entry);
+ const auto& entries = shader->GetEntries();
+ const bool use_unified = entries.use_unified_uniforms;
+ const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
+
+ const auto base_bindings = device.GetBaseBindings(stage_index);
+ u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+ for (const auto& entry : entries.const_buffers) {
+ const u32 index = entry.GetIndex();
+ const auto& buffer = shader_stage.const_buffers[index];
+ SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+ base_unified_offset + index * Maxwell::MaxConstBufferSize);
+ ++binding;
+ }
+ if (use_unified) {
+ const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+ entries.global_memory_entries.size());
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+ base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
- const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+ const auto& launch_desc = kepler_compute.launch_description;
+ const auto& entries = kernel->GetEntries();
+ const bool use_unified = entries.use_unified_uniforms;
u32 binding = 0;
- for (const auto& entry : kernel->GetEntries().const_buffers) {
+ for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
- SetupConstBuffer(binding++, buffer, entry);
+ SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+ use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+ ++binding;
+ }
+ if (use_unified) {
+ const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+ NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
-void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry) {
+void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
+ const Tegra::Engines::ConstBufferInfo& buffer,
+ const ConstBufferEntry& entry, bool use_unified,
+ std::size_t unified_offset) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
- bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
- sizeof(float));
+ if (device.UseAssemblyShaders()) {
+ glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
+ } else {
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
+ }
return;
}
@@ -780,68 +968,112 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
// UBO alignment requirements.
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
- const auto alignment = device.GetUniformBufferAlignment();
- const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
- device.HasFastBufferSubData());
- bind_ubo_pushbuffer.Push(binding, cbuf, offset, size);
+ const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+ const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+ const GPUVAddr gpu_addr = buffer.address;
+ auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+ if (device.UseAssemblyShaders()) {
+ UNIMPLEMENTED_IF(use_unified);
+ if (info.offset != 0) {
+ const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+ glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+ info.handle = staging_cbuf;
+ info.offset = 0;
+ }
+ glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
+ return;
+ }
+
+ if (use_unified) {
+ glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+ unified_offset, size);
+ } else {
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
+ }
}
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
- auto& gpu{system.GPU()};
- auto& memory_manager{gpu.MemoryManager()};
- const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
+ static constexpr std::array TARGET_LUT = {
+ GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
+ GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
+ };
+
+ const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
+ const auto& entries{shader->GetEntries().global_memory_entries};
- u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
- for (const auto& entry : shader->GetEntries().global_memory_entries) {
- const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
- const auto gpu_addr{memory_manager.Read<u64>(addr)};
- const auto size{memory_manager.Read<u32>(addr + 8)};
- SetupGlobalMemory(binding++, entry, gpu_addr, size);
+ std::array<BindlessSSBO, 32> ssbos;
+ ASSERT(entries.size() < ssbos.size());
+
+ const bool assembly_shaders = device.UseAssemblyShaders();
+ u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
+ for (const auto& entry : entries) {
+ const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
+ const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
+ const u32 size{gpu_memory.Read<u32>(addr + 8)};
+ SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
+ ++binding;
+ }
+ if (assembly_shaders) {
+ UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
}
}
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
- auto& gpu{system.GPU()};
- auto& memory_manager{gpu.MemoryManager()};
- const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
+ const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
+ const auto& entries{kernel->GetEntries().global_memory_entries};
+
+ std::array<BindlessSSBO, 32> ssbos;
+ ASSERT(entries.size() < ssbos.size());
u32 binding = 0;
- for (const auto& entry : kernel->GetEntries().global_memory_entries) {
- const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
- const auto gpu_addr{memory_manager.Read<u64>(addr)};
- const auto size{memory_manager.Read<u32>(addr + 8)};
- SetupGlobalMemory(binding++, entry, gpu_addr, size);
+ for (const auto& entry : entries) {
+ const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
+ const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
+ const u32 size{gpu_memory.Read<u32>(addr + 8)};
+ SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
+ ++binding;
+ }
+ if (device.UseAssemblyShaders()) {
+ UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
- GPUVAddr gpu_addr, std::size_t size) {
- const auto alignment{device.GetShaderStorageBufferAlignment()};
- const auto [ssbo, buffer_offset] =
- buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
- bind_ssbo_pushbuffer.Push(binding, ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
+ GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
+ const size_t alignment{device.GetShaderStorageBufferAlignment()};
+ const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+ if (device.UseAssemblyShaders()) {
+ *ssbo = BindlessSSBO{
+ .address = static_cast<GLuint64EXT>(info.address + info.offset),
+ .length = static_cast<GLsizei>(size),
+ .padding = 0,
+ };
+ } else {
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
+ static_cast<GLsizeiptr>(size));
+ }
}
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
MICROPROFILE_SCOPE(OpenGL_Texture);
- const auto& maxwell3d = system.GPU().Maxwell3D();
u32 binding = device.GetBaseBindings(stage_index).sampler;
for (const auto& entry : shader->GetEntries().samplers) {
const auto shader_type = static_cast<ShaderType>(stage_index);
- for (std::size_t i = 0; i < entry.Size(); ++i) {
+ for (std::size_t i = 0; i < entry.size; ++i) {
const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
SetupTexture(binding++, texture, entry);
}
}
}
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_Texture);
- const auto& compute = system.GPU().KeplerCompute();
u32 binding = 0;
for (const auto& entry : kernel->GetEntries().samplers) {
- for (std::size_t i = 0; i < entry.Size(); ++i) {
- const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i);
+ for (std::size_t i = 0; i < entry.size; ++i) {
+ const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i);
SetupTexture(binding++, texture, entry);
}
}
@@ -856,33 +1088,27 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
glBindTextureUnit(binding, 0);
return;
}
- glBindTextureUnit(binding, view->GetTexture());
-
- if (view->GetSurfaceParams().IsBuffer()) {
- return;
+ const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+ texture.tic.z_source, texture.tic.w_source);
+ glBindTextureUnit(binding, handle);
+ if (!view->GetSurfaceParams().IsBuffer()) {
+ glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
- // Apply swizzle to textures that are not buffers.
- view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
- texture.tic.w_source);
-
- glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
- const auto& maxwell3d = system.GPU().Maxwell3D();
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
u32 binding = device.GetBaseBindings(stage_index).image;
for (const auto& entry : shader->GetEntries().images) {
- const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
+ const auto shader_type = static_cast<ShaderType>(stage_index);
const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
SetupImage(binding++, tic, entry);
}
}
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
- const auto& compute = system.GPU().KeplerCompute();
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
u32 binding = 0;
for (const auto& entry : shader->GetEntries().images) {
- const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic;
+ const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic;
SetupImage(binding++, tic, entry);
}
}
@@ -894,27 +1120,43 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
return;
}
- if (!tic.IsBuffer()) {
- view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
- }
- if (entry.IsWritten()) {
+ if (entry.is_written) {
view->MarkAsModified(texture_cache.Tick());
}
- glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
- view->GetFormat());
+ const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+ glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
}
void RasterizerOpenGL::SyncViewport() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
- const auto& regs = gpu.regs;
+ auto& flags = maxwell3d.dirty.flags;
+ const auto& regs = maxwell3d.regs;
const bool dirty_viewport = flags[Dirty::Viewports];
+ const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+ if (dirty_clip_control || flags[Dirty::FrontFace]) {
+ flags[Dirty::FrontFace] = false;
+
+ GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+ if (regs.screen_y_control.triangle_rast_flip != 0 &&
+ regs.viewport_transform[0].scale_y < 0.0f) {
+ switch (mode) {
+ case GL_CW:
+ mode = GL_CCW;
+ break;
+ case GL_CCW:
+ mode = GL_CW;
+ break;
+ }
+ }
+ glFrontFace(mode);
+ }
+
if (dirty_viewport || flags[Dirty::ClipControl]) {
flags[Dirty::ClipControl] = false;
bool flip_y = false;
- if (regs.viewport_transform[0].scale_y < 0.0) {
+ if (regs.viewport_transform[0].scale_y < 0.0f) {
flip_y = !flip_y;
}
if (regs.screen_y_control.y_negate != 0) {
@@ -946,34 +1188,36 @@ void RasterizerOpenGL::SyncViewport() {
const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z;
const GLdouble far_depth = src.translate_z + src.scale_z;
glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth);
+
+ if (!GLAD_GL_NV_viewport_swizzle) {
+ continue;
+ }
+ glViewportSwizzleNV(static_cast<GLuint>(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x),
+ MaxwellToGL::ViewportSwizzle(src.swizzle.y),
+ MaxwellToGL::ViewportSwizzle(src.swizzle.z),
+ MaxwellToGL::ViewportSwizzle(src.swizzle.w));
}
}
}
void RasterizerOpenGL::SyncDepthClamp() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::DepthClampEnabled]) {
return;
}
flags[Dirty::DepthClampEnabled] = false;
- const auto& state = gpu.regs.view_volume_clip_control;
- UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near,
- "Unimplemented depth clamp separation!");
-
- oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near);
+ oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0);
}
void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) {
return;
}
flags[Dirty::ClipDistances] = false;
- clip_mask &= gpu.regs.clip_distance_enabled;
+ clip_mask &= maxwell3d.regs.clip_distance_enabled;
if (clip_mask == last_clip_distance_mask) {
return;
}
@@ -989,9 +1233,8 @@ void RasterizerOpenGL::SyncClipCoef() {
}
void RasterizerOpenGL::SyncCullMode() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
- const auto& regs = gpu.regs;
+ auto& flags = maxwell3d.dirty.flags;
+ const auto& regs = maxwell3d.regs;
if (flags[Dirty::CullTest]) {
flags[Dirty::CullTest] = false;
@@ -1003,34 +1246,27 @@ void RasterizerOpenGL::SyncCullMode() {
glDisable(GL_CULL_FACE);
}
}
-
- if (flags[Dirty::FrontFace]) {
- flags[Dirty::FrontFace] = false;
- glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
- }
}
void RasterizerOpenGL::SyncPrimitiveRestart() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::PrimitiveRestart]) {
return;
}
flags[Dirty::PrimitiveRestart] = false;
- if (gpu.regs.primitive_restart.enabled) {
+ if (maxwell3d.regs.primitive_restart.enabled) {
glEnable(GL_PRIMITIVE_RESTART);
- glPrimitiveRestartIndex(gpu.regs.primitive_restart.index);
+ glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index);
} else {
glDisable(GL_PRIMITIVE_RESTART);
}
}
void RasterizerOpenGL::SyncDepthTestState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
+ const auto& regs = maxwell3d.regs;
- const auto& regs = gpu.regs;
if (flags[Dirty::DepthMask]) {
flags[Dirty::DepthMask] = false;
glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE);
@@ -1048,14 +1284,13 @@ void RasterizerOpenGL::SyncDepthTestState() {
}
void RasterizerOpenGL::SyncStencilTestState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::StencilTest]) {
return;
}
flags[Dirty::StencilTest] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
oglEnable(GL_STENCIL_TEST, regs.stencil_enable);
glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func),
@@ -1080,25 +1315,24 @@ void RasterizerOpenGL::SyncStencilTestState() {
}
void RasterizerOpenGL::SyncRasterizeEnable() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::RasterizeEnable]) {
return;
}
flags[Dirty::RasterizeEnable] = false;
- oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0);
+ oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0);
}
void RasterizerOpenGL::SyncPolygonModes() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::PolygonModes]) {
return;
}
flags[Dirty::PolygonModes] = false;
- if (gpu.regs.fill_rectangle) {
+ const auto& regs = maxwell3d.regs;
+ if (regs.fill_rectangle) {
if (!GLAD_GL_NV_fill_rectangle) {
LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported");
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
@@ -1111,27 +1345,26 @@ void RasterizerOpenGL::SyncPolygonModes() {
return;
}
- if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) {
+ if (regs.polygon_mode_front == regs.polygon_mode_back) {
flags[Dirty::PolygonModeFront] = false;
flags[Dirty::PolygonModeBack] = false;
- glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+ glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front));
return;
}
if (flags[Dirty::PolygonModeFront]) {
flags[Dirty::PolygonModeFront] = false;
- glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+ glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front));
}
if (flags[Dirty::PolygonModeBack]) {
flags[Dirty::PolygonModeBack] = false;
- glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back));
+ glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back));
}
}
void RasterizerOpenGL::SyncColorMask() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::ColorMasks]) {
return;
}
@@ -1140,7 +1373,7 @@ void RasterizerOpenGL::SyncColorMask() {
const bool force = flags[Dirty::ColorMaskCommon];
flags[Dirty::ColorMaskCommon] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
if (regs.color_mask_common) {
if (!force && !flags[Dirty::ColorMask0]) {
return;
@@ -1165,33 +1398,30 @@ void RasterizerOpenGL::SyncColorMask() {
}
void RasterizerOpenGL::SyncMultiSampleState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::MultisampleControl]) {
return;
}
flags[Dirty::MultisampleControl] = false;
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& regs = maxwell3d.regs;
oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage);
oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one);
}
void RasterizerOpenGL::SyncFragmentColorClampState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::FragmentClampColor]) {
return;
}
flags[Dirty::FragmentClampColor] = false;
- glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE);
+ glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE);
}
void RasterizerOpenGL::SyncBlendState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
- const auto& regs = gpu.regs;
+ auto& flags = maxwell3d.dirty.flags;
+ const auto& regs = maxwell3d.regs;
if (flags[Dirty::BlendColor]) {
flags[Dirty::BlendColor] = false;
@@ -1248,14 +1478,13 @@ void RasterizerOpenGL::SyncBlendState() {
}
void RasterizerOpenGL::SyncLogicOpState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::LogicOp]) {
return;
}
flags[Dirty::LogicOp] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
if (regs.logic_op.enable) {
glEnable(GL_COLOR_LOGIC_OP);
glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation));
@@ -1265,14 +1494,13 @@ void RasterizerOpenGL::SyncLogicOpState() {
}
void RasterizerOpenGL::SyncScissorTest() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::Scissors]) {
return;
}
flags[Dirty::Scissors] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) {
if (!flags[Dirty::Scissor0 + index]) {
continue;
@@ -1291,16 +1519,15 @@ void RasterizerOpenGL::SyncScissorTest() {
}
void RasterizerOpenGL::SyncPointState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::PointSize]) {
return;
}
flags[Dirty::PointSize] = false;
- oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable);
+ oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable);
- if (gpu.regs.vp_point_size.enable) {
+ if (maxwell3d.regs.vp_point_size.enable) {
// By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled.
glEnable(GL_PROGRAM_POINT_SIZE);
return;
@@ -1308,32 +1535,30 @@ void RasterizerOpenGL::SyncPointState() {
// Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid
// in OpenGL).
- glPointSize(std::max(1.0f, gpu.regs.point_size));
+ glPointSize(std::max(1.0f, maxwell3d.regs.point_size));
glDisable(GL_PROGRAM_POINT_SIZE);
}
void RasterizerOpenGL::SyncLineState() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::LineWidth]) {
return;
}
flags[Dirty::LineWidth] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable);
glLineWidth(regs.line_smooth_enable ? regs.line_width_smooth : regs.line_width_aliased);
}
void RasterizerOpenGL::SyncPolygonOffset() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::PolygonOffset]) {
return;
}
flags[Dirty::PolygonOffset] = false;
- const auto& regs = gpu.regs;
+ const auto& regs = maxwell3d.regs;
oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable);
oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable);
oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable);
@@ -1347,18 +1572,13 @@ void RasterizerOpenGL::SyncPolygonOffset() {
}
void RasterizerOpenGL::SyncAlphaTest() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::AlphaTest]) {
return;
}
flags[Dirty::AlphaTest] = false;
- const auto& regs = gpu.regs;
- if (regs.alpha_test_enabled && regs.rt_control.count > 1) {
- LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested");
- }
-
+ const auto& regs = maxwell3d.regs;
if (regs.alpha_test_enabled) {
glEnable(GL_ALPHA_TEST);
glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref);
@@ -1368,22 +1588,79 @@ void RasterizerOpenGL::SyncAlphaTest() {
}
void RasterizerOpenGL::SyncFramebufferSRGB() {
- auto& gpu = system.GPU().Maxwell3D();
- auto& flags = gpu.dirty.flags;
+ auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::FramebufferSRGB]) {
return;
}
flags[Dirty::FramebufferSRGB] = false;
- oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
+ oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb);
+}
+
+void RasterizerOpenGL::SyncTransformFeedback() {
+ // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+ // when this is required.
+ const auto& regs = maxwell3d.regs;
+
+ static constexpr std::size_t STRIDE = 3;
+ std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+ std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+ GLint* cursor = attribs.data();
+ GLint* current_stream = streams.data();
+
+ for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+ const auto& layout = regs.tfb_layouts[feedback];
+ UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+ if (layout.varying_count == 0) {
+ continue;
+ }
+
+ *current_stream = static_cast<GLint>(feedback);
+ if (current_stream != streams.data()) {
+ // When stepping one stream, push the expected token
+ cursor[0] = GL_NEXT_BUFFER_NV;
+ cursor[1] = 0;
+ cursor[2] = 0;
+ cursor += STRIDE;
+ }
+ ++current_stream;
+
+ const auto& locations = regs.tfb_varying_locs[feedback];
+ std::optional<u8> current_index;
+ for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+ const u8 location = locations[offset];
+ const u8 index = location / 4;
+
+ if (current_index == index) {
+ // Increase number of components of the previous attachment
+ ++cursor[-2];
+ continue;
+ }
+ current_index = index;
+
+ std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+ cursor[1] = 1;
+ cursor += STRIDE;
+ }
+ }
+
+ const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+ const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+ glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+ GL_INTERLEAVED_ATTRIBS);
}
void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& regs = maxwell3d.regs;
if (regs.tfb_enabled == 0) {
return;
}
+ if (device.UseAssemblyShaders()) {
+ SyncTransformFeedback();
+ }
+
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1410,11 +1687,15 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
static_cast<GLsizeiptr>(size));
}
+ // We may have to call BeginTransformFeedbackNV here since they seem to call different
+ // implementations on Nvidia's driver (the pointer is different) but we are using
+ // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
+ // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
glBeginTransformFeedback(GL_POINTS);
}
void RasterizerOpenGL::EndTransformFeedback() {
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& regs = maxwell3d.regs;
if (regs.tfb_enabled == 0) {
return;
}
@@ -1431,8 +1712,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
const GLuint handle = transform_feedback_buffers[index].handle;
const GPUVAddr gpu_addr = binding.Address();
const std::size_t size = binding.buffer_size;
- const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
- glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+ const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+ glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+ static_cast<GLsizeiptr>(size));
}
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 435da4425..1d0f585fa 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,10 +19,10 @@
#include "video_core/engines/const_buffer_info.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_fence_manager.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -33,10 +33,11 @@
#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/utils.h"
+#include "video_core/shader/async_shaders.h"
#include "video_core/textures/texture.h"
-namespace Core {
-class System;
+namespace Core::Memory {
+class Memory;
}
namespace Core::Frontend {
@@ -52,10 +53,18 @@ namespace OpenGL {
struct ScreenInfo;
struct DrawParameters;
+struct BindlessSSBO {
+ GLuint64EXT address;
+ GLsizei length;
+ GLsizei padding;
+};
+static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128);
+
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
- explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
- ScreenInfo& info, GLShader::ProgramManager& program_manager,
+ explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu,
+ Core::Memory::Memory& cpu_memory, const Device& device,
+ ScreenInfo& screen_info, ProgramManager& program_manager,
StateTracker& state_tracker);
~RasterizerOpenGL() override;
@@ -66,8 +75,15 @@ public:
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override;
+ bool MustFlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override;
+ void OnCPUWrite(VAddr addr, u64 size) override;
+ void SyncGuestHost() override;
+ void SignalSemaphore(GPUVAddr addr, u32 value) override;
+ void SignalSyncPoint(u32 value) override;
+ void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+ void WaitForIdle() override;
void FlushCommands() override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
@@ -75,56 +91,65 @@ public:
const Tegra::Engines::Fermi2D::Config& copy_config) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
- void LoadDiskResources(const std::atomic_bool& stop_loading,
+ void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override;
- void SetupDirtyFlags() override;
/// Returns true when there are commands queued to the OpenGL server.
bool AnyCommandQueued() const {
return num_queued_commands > 0;
}
+ VideoCommon::Shader::AsyncShaders& GetAsyncShaders() {
+ return async_shaders;
+ }
+
+ const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const {
+ return async_shaders;
+ }
+
private:
/// Configures the color and depth framebuffer states.
void ConfigureFramebuffers();
- void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb);
+ /// Configures the color and depth framebuffer for clearing.
+ void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
/// Configures the current constbuffers to use for the draw command.
- void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+ void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
/// Configures the current constbuffers to use for the kernel invocation.
- void SetupComputeConstBuffers(const Shader& kernel);
+ void SetupComputeConstBuffers(Shader* kernel);
/// Configures a constant buffer.
- void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry);
+ void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+ const ConstBufferEntry& entry, bool use_unified,
+ std::size_t unified_offset);
/// Configures the current global memory entries to use for the draw command.
- void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+ void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
/// Configures the current global memory entries to use for the kernel invocation.
- void SetupComputeGlobalMemory(const Shader& kernel);
+ void SetupComputeGlobalMemory(Shader* kernel);
- /// Configures a constant buffer.
+ /// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
- std::size_t size);
+ size_t size, BindlessSSBO* ssbo);
/// Configures the current textures to use for the draw command.
- void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+ void SetupDrawTextures(std::size_t stage_index, Shader* shader);
/// Configures the textures used in a compute shader.
- void SetupComputeTextures(const Shader& kernel);
+ void SetupComputeTextures(Shader* kernel);
/// Configures a texture.
void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry);
/// Configures images in a graphics shader.
- void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+ void SetupDrawImages(std::size_t stage_index, Shader* shader);
/// Configures images in a compute shader.
- void SetupComputeImages(const Shader& shader);
+ void SetupComputeImages(Shader* shader);
/// Configures an image.
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -192,6 +217,10 @@ private:
/// Syncs the framebuffer sRGB state to match the guest state
void SyncFramebufferSRGB();
+ /// Syncs transform feedback state to match guest state
+ /// @note Only valid on assembly shaders
+ void SyncTransformFeedback();
+
/// Begin a transform feedback
void BeginTransformFeedback(GLenum primitive_mode);
@@ -215,31 +244,42 @@ private:
void SetupShaders(GLenum primitive_mode);
- const Device device;
+ Tegra::GPU& gpu;
+ Tegra::Engines::Maxwell3D& maxwell3d;
+ Tegra::Engines::KeplerCompute& kepler_compute;
+ Tegra::MemoryManager& gpu_memory;
+
+ const Device& device;
+ ScreenInfo& screen_info;
+ ProgramManager& program_manager;
+ StateTracker& state_tracker;
TextureCacheOpenGL texture_cache;
ShaderCacheOpenGL shader_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
QueryCache query_cache;
+ OGLBufferCache buffer_cache;
+ FenceManagerOpenGL fence_manager;
- Core::System& system;
- ScreenInfo& screen_info;
- GLShader::ProgramManager& program_manager;
- StateTracker& state_tracker;
+ VideoCommon::Shader::AsyncShaders async_shaders;
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
- OGLBufferCache buffer_cache;
- VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker};
- BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
- BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
+ GLint vertex_binding = 0;
std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
transform_feedback_buffers;
std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
enabled_transform_feedback_buffers;
+ static constexpr std::size_t NUM_CONSTANT_BUFFERS =
+ Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+ Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+ std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
+ std::size_t current_cbuf = 0;
+ OGLBuffer unified_uniform_buffer;
+
/// Number of commands queued to the OpenGL driver. Reset on flush.
std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 97803d480..0ebcec427 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <string_view>
#include <utility>
#include <glad/glad.h>
#include "common/common_types.h"
@@ -82,11 +83,13 @@ void OGLSampler::Release() {
handle = 0;
}
-void OGLShader::Create(const char* source, GLenum type) {
- if (handle != 0)
+void OGLShader::Create(std::string_view source, GLenum type) {
+ if (handle != 0) {
return;
- if (source == nullptr)
+ }
+ if (source.empty()) {
return;
+ }
MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
handle = GLShader::LoadShader(source, type);
@@ -125,6 +128,15 @@ void OGLProgram::Release() {
handle = 0;
}
+void OGLAssemblyProgram::Release() {
+ if (handle == 0) {
+ return;
+ }
+ MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+ glDeleteProgramsARB(1, &handle);
+ handle = 0;
+}
+
void OGLPipeline::Create() {
if (handle != 0)
return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index de93f4212..f48398669 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -4,6 +4,7 @@
#pragma once
+#include <string_view>
#include <utility>
#include <glad/glad.h>
#include "common/common_types.h"
@@ -127,7 +128,7 @@ public:
return *this;
}
- void Create(const char* source, GLenum type);
+ void Create(std::string_view source, GLenum type);
void Release();
@@ -167,6 +168,28 @@ public:
GLuint handle = 0;
};
+class OGLAssemblyProgram : private NonCopyable {
+public:
+ OGLAssemblyProgram() = default;
+
+ OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+ ~OGLAssemblyProgram() {
+ Release();
+ }
+
+ OGLAssemblyProgram& operator=(OGLAssemblyProgram&& o) noexcept {
+ Release();
+ handle = std::exchange(o.handle, 0);
+ return *this;
+ }
+
+ /// Deletes the internal OpenGL resource
+ void Release();
+
+ GLuint handle = 0;
+};
+
class OGLPipeline : private NonCopyable {
public:
OGLPipeline() = default;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 12c6dcfde..bd56bed0c 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -10,8 +10,6 @@
#include <thread>
#include <unordered_set>
-#include <boost/functional/hash.hpp>
-
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
@@ -22,83 +20,35 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/utils.h"
+#include "video_core/shader/memory_util.h"
#include "video_core/shader/registry.h"
#include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
+#include "video_core/shader_notify.h"
namespace OpenGL {
using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::CompileDepth;
-using VideoCommon::Shader::CompilerSettings;
+using VideoCommon::Shader::GetShaderAddress;
+using VideoCommon::Shader::GetShaderCode;
+using VideoCommon::Shader::GetUniqueIdentifier;
+using VideoCommon::Shader::KERNEL_MAIN_OFFSET;
using VideoCommon::Shader::ProgramCode;
using VideoCommon::Shader::Registry;
using VideoCommon::Shader::ShaderIR;
+using VideoCommon::Shader::STAGE_MAIN_OFFSET;
namespace {
-constexpr u32 STAGE_MAIN_OFFSET = 10;
-constexpr u32 KERNEL_MAIN_OFFSET = 0;
-
-constexpr CompilerSettings COMPILER_SETTINGS{CompileDepth::FullDecompile};
-
-/// Gets the address for the specified shader stage program
-GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) {
- const auto& gpu{system.GPU().Maxwell3D()};
- const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
- return gpu.regs.code_address.CodeAddress() + shader_config.offset;
-}
-
-/// Gets if the current instruction offset is a scheduler instruction
-constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
- // Sched instructions appear once every 4 instructions.
- constexpr std::size_t SchedPeriod = 4;
- const std::size_t absolute_offset = offset - main_offset;
- return (absolute_offset % SchedPeriod) == 0;
-}
-
-/// Calculates the size of a program stream
-std::size_t CalculateProgramSize(const ProgramCode& program) {
- constexpr std::size_t start_offset = 10;
- // This is the encoded version of BRA that jumps to itself. All Nvidia
- // shaders end with one.
- constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL;
- constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL;
- std::size_t offset = start_offset;
- while (offset < program.size()) {
- const u64 instruction = program[offset];
- if (!IsSchedInstruction(offset, start_offset)) {
- if ((instruction & mask) == self_jumping_branch) {
- // End on Maxwell's "nop" instruction
- break;
- }
- if (instruction == 0) {
- break;
- }
- }
- offset++;
- }
- // The last instruction is included in the program size
- return std::min(offset + 1, program.size());
-}
-
-/// Gets the shader program code from memory for the specified address
-ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr,
- const u8* host_ptr) {
- ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
- ASSERT_OR_EXECUTE(host_ptr != nullptr, {
- std::fill(code.begin(), code.end(), 0);
- return code;
- });
- memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64));
- code.resize(CalculateProgramSize(code));
- return code;
-}
+constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{};
/// Gets the shader type from a Maxwell program type
constexpr GLenum GetGLShaderType(ShaderType shader_type) {
@@ -116,17 +66,6 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) {
}
}
-/// Hashes one (or two) program streams
-u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code,
- const ProgramCode& code_b = {}) {
- u64 unique_identifier = boost::hash_value(code);
- if (is_a) {
- // VertexA programs include two programs
- boost::hash_combine(unique_identifier, boost::hash_value(code_b));
- }
- return unique_identifier;
-}
-
constexpr const char* GetShaderTypeName(ShaderType shader_type) {
switch (shader_type) {
case ShaderType::Vertex:
@@ -162,6 +101,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
return {};
}
+constexpr GLenum AssemblyEnum(ShaderType shader_type) {
+ switch (shader_type) {
+ case ShaderType::Vertex:
+ return GL_VERTEX_PROGRAM_NV;
+ case ShaderType::TesselationControl:
+ return GL_TESS_CONTROL_PROGRAM_NV;
+ case ShaderType::TesselationEval:
+ return GL_TESS_EVALUATION_PROGRAM_NV;
+ case ShaderType::Geometry:
+ return GL_GEOMETRY_PROGRAM_NV;
+ case ShaderType::Fragment:
+ return GL_FRAGMENT_PROGRAM_NV;
+ case ShaderType::Compute:
+ return GL_COMPUTE_PROGRAM_NV;
+ }
+ return {};
+}
+
std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
}
@@ -170,7 +127,7 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size};
const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer,
entry.graphics_info, entry.compute_info};
- const auto registry = std::make_shared<Registry>(entry.type, info);
+ auto registry = std::make_shared<Registry>(entry.type, info);
for (const auto& [address, value] : entry.keys) {
const auto [buffer, offset] = address;
registry->InsertKey(buffer, offset, value);
@@ -185,21 +142,6 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
return registry;
}
-std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
- u64 unique_identifier, const ShaderIR& ir,
- const Registry& registry, bool hint_retrievable = false) {
- const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
- LOG_INFO(Render_OpenGL, "{}", shader_id);
-
- const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
- OGLShader shader;
- shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
-
- auto program = std::make_shared<OGLProgram>();
- program->Create(true, hint_retrievable, shader.handle);
- return program;
-}
-
std::unordered_set<GLenum> GetSupportedFormats() {
GLint num_formats;
glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats);
@@ -216,55 +158,138 @@ std::unordered_set<GLenum> GetSupportedFormats() {
} // Anonymous namespace
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
- std::shared_ptr<VideoCommon::Shader::Registry> registry,
- ShaderEntries entries, std::shared_ptr<OGLProgram> program)
- : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
- size_in_bytes{size_in_bytes}, program{std::move(program)} {}
+ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
+ const ShaderIR& ir, const Registry& registry, bool hint_retrievable) {
+ const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
+ LOG_INFO(Render_OpenGL, "{}", shader_id);
+
+ auto program = std::make_shared<ProgramHandle>();
+
+ if (device.UseAssemblyShaders()) {
+ const std::string arb =
+ DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
+
+ GLuint& arb_prog = program->assembly_program.handle;
+
+// The functions in the disabled (#if 0) branch signal OpenGL errors but are compatible with
+// apitrace. Enable them only to capture and replay on apitrace.
+#if 0
+ glGenProgramsNV(1, &arb_prog);
+ glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
+ reinterpret_cast<const GLubyte*>(arb.data()));
+#else
+ glGenProgramsARB(1, &arb_prog);
+ glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
+ static_cast<GLsizei>(arb.size()), arb.data());
+#endif
+ const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
+ if (err && *err) {
+ LOG_CRITICAL(Render_OpenGL, "{}", err);
+ LOG_INFO(Render_OpenGL, "\n{}", arb);
+ }
+ } else {
+ const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
+ OGLShader shader;
+ shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+
+ program->source_program.Create(true, hint_retrievable, shader.handle);
+ }
-CachedShader::~CachedShader() = default;
+ return program;
+}
-GLuint CachedShader::GetHandle() const {
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+ ProgramSharedPtr program_, bool is_built)
+ : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)},
+ is_built(is_built) {
+ handle = program->assembly_program.handle;
+ if (handle == 0) {
+ handle = program->source_program.handle;
+ }
+ if (is_built) {
+ ASSERT(handle != 0);
+ }
+}
+
+Shader::~Shader() = default;
+
+GLuint Shader::GetHandle() const {
DEBUG_ASSERT(registry->IsConsistent());
- return program->handle;
+ return handle;
}
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
- Maxwell::ShaderProgram program_type, ProgramCode code,
- ProgramCode code_b) {
+bool Shader::IsBuilt() const {
+ return is_built;
+}
+
+void Shader::AsyncOpenGLBuilt(OGLProgram new_program) {
+ program->source_program = std::move(new_program);
+ handle = program->source_program.handle;
+ is_built = true;
+}
+
+void Shader::AsyncGLASMBuilt(OGLAssemblyProgram new_program) {
+ program->assembly_program = std::move(new_program);
+ handle = program->assembly_program.handle;
+ is_built = true;
+}
+
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(
+ const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code,
+ ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) {
const auto shader_type = GetShaderType(program_type);
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
- auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D());
- const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
- // TODO(Rodrigo): Handle VertexA shaders
- // std::optional<ShaderIR> ir_b;
- // if (!code_b.empty()) {
- // ir_b.emplace(code_b, STAGE_MAIN_OFFSET);
- // }
- auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry);
+ auto& gpu = params.gpu;
+ gpu.ShaderNotify().MarkSharderBuilding();
+
+ auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D());
+ if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) {
+ const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
+ // TODO(Rodrigo): Handle VertexA shaders
+ // std::optional<ShaderIR> ir_b;
+ // if (!code_b.empty()) {
+ // ir_b.emplace(code_b, STAGE_MAIN_OFFSET);
+ // }
+ auto program =
+ BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry);
+ ShaderDiskCacheEntry entry;
+ entry.type = shader_type;
+ entry.code = std::move(code);
+ entry.code_b = std::move(code_b);
+ entry.unique_identifier = params.unique_identifier;
+ entry.bound_buffer = registry->GetBoundBuffer();
+ entry.graphics_info = registry->GetGraphicsInfo();
+ entry.keys = registry->GetKeys();
+ entry.bound_samplers = registry->GetBoundSamplers();
+ entry.bindless_samplers = registry->GetBindlessSamplers();
+ params.disk_cache.SaveEntry(std::move(entry));
+
+ gpu.ShaderNotify().MarkShaderComplete();
+
+ return std::unique_ptr<Shader>(new Shader(std::move(registry),
+ MakeEntries(params.device, ir, shader_type),
+ std::move(program), true));
+ } else {
+ // Required for entries
+ const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
+ auto entries = MakeEntries(params.device, ir, shader_type);
- ShaderDiskCacheEntry entry;
- entry.type = shader_type;
- entry.code = std::move(code);
- entry.code_b = std::move(code_b);
- entry.unique_identifier = params.unique_identifier;
- entry.bound_buffer = registry->GetBoundBuffer();
- entry.graphics_info = registry->GetGraphicsInfo();
- entry.keys = registry->GetKeys();
- entry.bound_samplers = registry->GetBoundSamplers();
- entry.bindless_samplers = registry->GetBindlessSamplers();
- params.disk_cache.SaveEntry(std::move(entry));
+ async_shaders.QueueOpenGLShader(params.device, shader_type, params.unique_identifier,
+ std::move(code), std::move(code_b), STAGE_MAIN_OFFSET,
+ COMPILER_SETTINGS, *registry, cpu_addr);
- return std::shared_ptr<CachedShader>(new CachedShader(
- params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+ auto program = std::make_shared<ProgramHandle>();
+ return std::unique_ptr<Shader>(
+ new Shader(std::move(registry), std::move(entries), std::move(program), false));
+ }
}
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+ ProgramCode code) {
+ auto& gpu = params.gpu;
+ gpu.ShaderNotify().MarkSharderBuilding();
- auto& engine = params.system.GPU().KeplerCompute();
- auto registry = std::make_shared<Registry>(ShaderType::Compute, engine);
+ auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine);
const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
const u64 uid = params.unique_identifier;
auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry);
@@ -280,31 +305,43 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
- return std::shared_ptr<CachedShader>(new CachedShader(
- params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+ gpu.ShaderNotify().MarkShaderComplete();
+
+ return std::unique_ptr<Shader>(new Shader(std::move(registry),
+ MakeEntries(params.device, ir, ShaderType::Compute),
+ std::move(program)));
}
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
- const PrecompiledShader& precompiled_shader,
- std::size_t size_in_bytes) {
- return std::shared_ptr<CachedShader>(
- new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
- precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+ const PrecompiledShader& precompiled_shader) {
+ return std::unique_ptr<Shader>(new Shader(
+ precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
}
-ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
- Core::Frontend::EmuWindow& emu_window, const Device& device)
- : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
- disk_cache{system} {}
+ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer,
+ Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
+ Tegra::Engines::Maxwell3D& maxwell3d_,
+ Tegra::Engines::KeplerCompute& kepler_compute_,
+ Tegra::MemoryManager& gpu_memory_, const Device& device_)
+ : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_},
+ gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_},
+ kepler_compute{kepler_compute_}, device{device_} {}
-void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
+
+void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) {
+ disk_cache.BindTitleID(title_id);
const std::optional transferable = disk_cache.LoadTransferable();
if (!transferable) {
return;
}
- const std::vector gl_cache = disk_cache.LoadPrecompiled();
+ std::vector<ShaderDiskCachePrecompiled> gl_cache;
+ if (!device.UseAssemblyShaders()) {
+ // Only load precompiled cache when we are not using assembly shaders
+ gl_cache = disk_cache.LoadPrecompiled();
+ }
const auto supported_formats = GetSupportedFormats();
// Track if precompiled cache was altered during loading to know if we have to
@@ -343,7 +380,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
auto registry = MakeRegistry(entry);
const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
- std::shared_ptr<OGLProgram> program;
+ ProgramSharedPtr program;
if (precompiled_entry) {
// If the shader is precompiled, attempt to load it from the precompiled entry
program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -359,7 +396,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
PrecompiledShader shader;
shader.program = std::move(program);
shader.registry = std::move(registry);
- shader.entries = MakeEntries(ir);
+ shader.entries = MakeEntries(device, ir, entry.type);
std::scoped_lock lock{mutex};
if (callback) {
@@ -370,7 +407,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
}
};
- const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)};
+ const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())};
const std::size_t bucket_size{transferable->size() / num_workers};
std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
std::vector<std::thread> threads(num_workers);
@@ -397,6 +434,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
return;
}
+ if (device.UseAssemblyShaders()) {
+ // Don't store precompiled binaries for assembly shaders.
+ return;
+ }
+
// TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
// before precompiling them
@@ -404,7 +446,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
const u64 id = (*transferable)[i].unique_identifier;
const auto it = find_precompiled(id);
if (it == gl_cache.end()) {
- const GLuint program = runtime_cache.at(id).program->handle;
+ const GLuint program = runtime_cache.at(id).program->source_program.handle;
disk_cache.SavePrecompiled(id, program);
precompiled_cache_altered = true;
}
@@ -415,7 +457,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
}
}
-std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats) {
if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -423,15 +465,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
return {};
}
- auto program = std::make_shared<OGLProgram>();
- program->handle = glCreateProgram();
- glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
- glProgramBinary(program->handle, precompiled_entry.binary_format,
- precompiled_entry.binary.data(),
+ auto program = std::make_shared<ProgramHandle>();
+ GLuint& handle = program->source_program.handle;
+ handle = glCreateProgram();
+ glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+ glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
static_cast<GLsizei>(precompiled_entry.binary.size()));
GLint link_status;
- glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
+ glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
if (link_status == GL_FALSE) {
LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
return {};
@@ -440,77 +482,122 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
return program;
}
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
- if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
- return last_shaders[static_cast<std::size_t>(program)];
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program,
+ VideoCommon::Shader::AsyncShaders& async_shaders) {
+ if (!maxwell3d.dirty.flags[Dirty::Shaders]) {
+ auto* last_shader = last_shaders[static_cast<std::size_t>(program)];
+ if (last_shader->IsBuilt()) {
+ return last_shader;
+ }
}
- auto& memory_manager{system.GPU().MemoryManager()};
- const GPUVAddr address{GetShaderAddress(system, program)};
+ const GPUVAddr address{GetShaderAddress(maxwell3d, program)};
+
+ if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) {
+ auto completed_work = async_shaders.GetCompletedWork();
+ for (auto& work : completed_work) {
+ Shader* shader = TryGet(work.cpu_address);
+ gpu.ShaderNotify().MarkShaderComplete();
+ if (shader == nullptr) {
+ continue;
+ }
+ using namespace VideoCommon::Shader;
+ if (work.backend == AsyncShaders::Backend::OpenGL) {
+ shader->AsyncOpenGLBuilt(std::move(work.program.opengl));
+ } else if (work.backend == AsyncShaders::Backend::GLASM) {
+ shader->AsyncGLASMBuilt(std::move(work.program.glasm));
+ }
+
+ auto& registry = shader->GetRegistry();
+
+ ShaderDiskCacheEntry entry;
+ entry.type = work.shader_type;
+ entry.code = std::move(work.code);
+ entry.code_b = std::move(work.code_b);
+ entry.unique_identifier = work.uid;
+ entry.bound_buffer = registry.GetBoundBuffer();
+ entry.graphics_info = registry.GetGraphicsInfo();
+ entry.keys = registry.GetKeys();
+ entry.bound_samplers = registry.GetBoundSamplers();
+ entry.bindless_samplers = registry.GetBindlessSamplers();
+ disk_cache.SaveEntry(std::move(entry));
+ }
+ }
// Look up shader in the cache based on address
- const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
- Shader shader{cpu_addr ? TryGet(*cpu_addr) : nullptr};
- if (shader) {
+ const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)};
+ if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
return last_shaders[static_cast<std::size_t>(program)] = shader;
}
- const auto host_ptr{memory_manager.GetPointer(address)};
+ const u8* const host_ptr{gpu_memory.GetPointer(address)};
// No shader found - create a new one
- ProgramCode code{GetShaderCode(memory_manager, address, host_ptr)};
+ ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)};
ProgramCode code_b;
if (program == Maxwell::ShaderProgram::VertexA) {
- const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
- code_b = GetShaderCode(memory_manager, address_b, memory_manager.GetPointer(address_b));
+ const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)};
+ const u8* host_ptr_b = gpu_memory.GetPointer(address_b);
+ code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false);
}
+ const std::size_t code_size = code.size() * sizeof(u64);
- const auto unique_identifier = GetUniqueIdentifier(
+ const u64 unique_identifier = GetUniqueIdentifier(
GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
- const ShaderParameters params{system, disk_cache, device,
- *cpu_addr, host_ptr, unique_identifier};
+ const ShaderParameters params{gpu, maxwell3d, disk_cache, device,
+ *cpu_addr, host_ptr, unique_identifier};
+ std::unique_ptr<Shader> shader;
const auto found = runtime_cache.find(unique_identifier);
if (found == runtime_cache.end()) {
- shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
- std::move(code_b));
+ shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b),
+ async_shaders, cpu_addr.value_or(0));
} else {
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
- shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+ shader = Shader::CreateFromCache(params, found->second);
}
- Register(shader);
- return last_shaders[static_cast<std::size_t>(program)] = shader;
+ Shader* const result = shader.get();
+ if (cpu_addr) {
+ Register(std::move(shader), *cpu_addr, code_size);
+ } else {
+ null_shader = std::move(shader);
+ }
+
+ return last_shaders[static_cast<std::size_t>(program)] = result;
}
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
- auto& memory_manager{system.GPU().MemoryManager()};
- const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+ const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)};
- auto kernel = cpu_addr ? TryGet(*cpu_addr) : nullptr;
- if (kernel) {
+ if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
return kernel;
}
- const auto host_ptr{memory_manager.GetPointer(code_addr)};
// No kernel found, create a new one
- auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
- const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+ const u8* host_ptr{gpu_memory.GetPointer(code_addr)};
+ ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)};
+ const std::size_t code_size{code.size() * sizeof(u64)};
+ const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
- const ShaderParameters params{system, disk_cache, device,
- *cpu_addr, host_ptr, unique_identifier};
+ const ShaderParameters params{gpu, kepler_compute, disk_cache, device,
+ *cpu_addr, host_ptr, unique_identifier};
+ std::unique_ptr<Shader> kernel;
const auto found = runtime_cache.find(unique_identifier);
if (found == runtime_cache.end()) {
- kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+ kernel = Shader::CreateKernelFromMemory(params, std::move(code));
} else {
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
- kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+ kernel = Shader::CreateFromCache(params, found->second);
}
- Register(kernel);
- return kernel;
+ Shader* const result = kernel.get();
+ if (cpu_addr) {
+ Register(std::move(kernel), *cpu_addr, code_size);
+ } else {
+ null_kernel = std::move(kernel);
+ }
+ return result;
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index c836df5bd..1708af06a 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,114 +18,143 @@
#include "common/common_types.h"
#include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
#include "video_core/shader/registry.h"
#include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
-namespace Core {
-class System;
+namespace Tegra {
+class MemoryManager;
}
namespace Core::Frontend {
class EmuWindow;
}
+namespace VideoCommon::Shader {
+class AsyncShaders;
+}
+
namespace OpenGL {
-class CachedShader;
class Device;
class RasterizerOpenGL;
-struct UnspecializedShader;
-using Shader = std::shared_ptr<CachedShader>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+struct ProgramHandle {
+ OGLProgram source_program;
+ OGLAssemblyProgram assembly_program;
+};
+using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
+
struct PrecompiledShader {
- std::shared_ptr<OGLProgram> program;
+ ProgramSharedPtr program;
std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries;
};
struct ShaderParameters {
- Core::System& system;
+ Tegra::GPU& gpu;
+ Tegra::Engines::ConstBufferEngineInterface& engine;
ShaderDiskCacheOpenGL& disk_cache;
const Device& device;
VAddr cpu_addr;
- u8* host_ptr;
+ const u8* host_ptr;
u64 unique_identifier;
};
-class CachedShader final : public RasterizerCacheObject {
+ProgramSharedPtr BuildShader(const Device& device, Tegra::Engines::ShaderType shader_type,
+ u64 unique_identifier, const VideoCommon::Shader::ShaderIR& ir,
+ const VideoCommon::Shader::Registry& registry,
+ bool hint_retrievable = false);
+
+class Shader final {
public:
- ~CachedShader();
+ ~Shader();
/// Gets the GL program handle for the shader
GLuint GetHandle() const;
- /// Returns the size in bytes of the shader
- std::size_t GetSizeInBytes() const override {
- return size_in_bytes;
- }
+ bool IsBuilt() const;
/// Gets the shader entries for the shader
const ShaderEntries& GetEntries() const {
return entries;
}
- static Shader CreateStageFromMemory(const ShaderParameters& params,
- Maxwell::ShaderProgram program_type,
- ProgramCode program_code, ProgramCode program_code_b);
- static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+ const VideoCommon::Shader::Registry& GetRegistry() const {
+ return *registry;
+ }
+
+ /// Mark an OpenGL shader as built
+ void AsyncOpenGLBuilt(OGLProgram new_program);
+
+ /// Mark a GLASM shader as built
+ void AsyncGLASMBuilt(OGLAssemblyProgram new_program);
+
+ static std::unique_ptr<Shader> CreateStageFromMemory(
+ const ShaderParameters& params, Maxwell::ShaderProgram program_type,
+ ProgramCode program_code, ProgramCode program_code_b,
+ VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr);
+
+ static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+ ProgramCode code);
- static Shader CreateFromCache(const ShaderParameters& params,
- const PrecompiledShader& precompiled_shader,
- std::size_t size_in_bytes);
+ static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+ const PrecompiledShader& precompiled_shader);
private:
- explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
- std::shared_ptr<VideoCommon::Shader::Registry> registry,
- ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+ explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+ ProgramSharedPtr program, bool is_built = true);
std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries;
- std::size_t size_in_bytes = 0;
- std::shared_ptr<OGLProgram> program;
+ ProgramSharedPtr program;
+ GLuint handle = 0;
+ bool is_built{};
};
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
public:
- explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
- Core::Frontend::EmuWindow& emu_window, const Device& device);
+ explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window,
+ Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d,
+ Tegra::Engines::KeplerCompute& kepler_compute,
+ Tegra::MemoryManager& gpu_memory, const Device& device);
+ ~ShaderCacheOpenGL() override;
/// Loads disk cache for the current game
- void LoadDiskCache(const std::atomic_bool& stop_loading,
+ void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback);
/// Gets the current specified shader stage program
- Shader GetStageProgram(Maxwell::ShaderProgram program);
+ Shader* GetStageProgram(Maxwell::ShaderProgram program,
+ VideoCommon::Shader::AsyncShaders& async_shaders);
/// Gets a compute kernel in the passed address
- Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
- // We do not have to flush this cache as things in it are never modified by us.
- void FlushObjectInner(const Shader& object) override {}
+ Shader* GetComputeKernel(GPUVAddr code_addr);
private:
- std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+ ProgramSharedPtr GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats);
- Core::System& system;
Core::Frontend::EmuWindow& emu_window;
+ Tegra::GPU& gpu;
+ Tegra::MemoryManager& gpu_memory;
+ Tegra::Engines::Maxwell3D& maxwell3d;
+ Tegra::Engines::KeplerCompute& kepler_compute;
const Device& device;
+
ShaderDiskCacheOpenGL disk_cache;
std::unordered_map<u64, PrecompiledShader> runtime_cache;
- std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+ std::unique_ptr<Shader> null_shader;
+ std::unique_ptr<Shader> null_kernel;
+
+ std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index b1804e9ea..95ca96c8e 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
using Tegra::Shader::IpaSampleMode;
using Tegra::Shader::PixelImap;
using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
using VideoCommon::Shader::BuildTransformFeedback;
using VideoCommon::Shader::Registry;
@@ -61,8 +62,8 @@ struct TextureDerivates {};
using TextureArgument = std::pair<Type, Node>;
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
- static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
#define ftou floatBitsToUint
@@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+ const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+ // We waste one UBO for emulation
+ const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+ return num_ubos > num_available_ubos;
+}
+
struct GenericVaryingDescription {
std::string name;
u8 first_element = 0;
@@ -412,8 +420,9 @@ class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier, std::string_view suffix)
- : device{device}, ir{ir}, registry{registry}, stage{stage},
- identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+ : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+ suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+ UseUnifiedUniforms(device, ir, stage)} {
if (stage != ShaderType::Compute) {
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
}
@@ -484,7 +493,7 @@ private:
code.AddLine("switch (jmp_to) {{");
for (const auto& pair : ir.GetBasicBlocks()) {
- const auto [address, bb] = pair;
+ const auto& [address, bb] = pair;
code.AddLine("case 0x{:X}U: {{", address);
++code.scope;
@@ -518,6 +527,9 @@ private:
if (device.HasImageLoadFormatted()) {
code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
}
+ if (device.HasTextureShadowLod()) {
+ code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+ }
if (device.HasWarpIntrinsics()) {
code.AddLine("#extension GL_NV_gpu_shader5 : require");
code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -590,8 +602,15 @@ private:
return;
}
const auto& info = registry.GetComputeInfo();
- if (const u32 size = info.shared_memory_size_in_words; size > 0) {
- code.AddLine("shared uint smem[{}];", size);
+ if (u32 size = info.shared_memory_size_in_words * 4; size > 0) {
+ const u32 limit = device.GetMaxComputeSharedMemorySize();
+ if (size > limit) {
+ LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}",
+ size, limit);
+ size = limit;
+ }
+
+ code.AddLine("shared uint smem[{}];", size / 4);
code.AddNewLine();
}
code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;",
@@ -618,7 +637,9 @@ private:
break;
}
}
- if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+ if (stage != ShaderType::Geometry &&
+ (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
if (ir.UsesLayer()) {
code.AddLine("int gl_Layer;");
}
@@ -647,6 +668,16 @@ private:
--code.scope;
code.AddLine("}};");
code.AddNewLine();
+
+ if (stage == ShaderType::Geometry) {
+ if (ir.UsesLayer()) {
+ code.AddLine("out int gl_Layer;");
+ }
+ if (ir.UsesViewportIndex()) {
+ code.AddLine("out int gl_ViewportIndex;");
+ }
+ }
+ code.AddNewLine();
}
void DeclareRegisters() {
@@ -782,7 +813,7 @@ private:
const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
const auto it = transform_feedback.find(location);
if (it == transform_feedback.end()) {
- return {};
+ return std::nullopt;
}
return it->second.components;
}
@@ -834,11 +865,24 @@ private:
}
void DeclareConstantBuffers() {
+ if (use_unified_uniforms) {
+ const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+ static_cast<u32>(ir.GetGlobalMemory().size());
+ code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+ binding);
+ code.AddLine(" uint cbufs[];");
+ code.AddLine("}};");
+ code.AddNewLine();
+ return;
+ }
+
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
- for (const auto& [index, cbuf] : ir.GetConstantBuffers()) {
+ for (const auto [index, info] : ir.GetConstantBuffers()) {
+ const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+ const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
GetConstBufferBlock(index));
- code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+ code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
code.AddLine("}};");
code.AddNewLine();
}
@@ -869,37 +913,37 @@ private:
for (const auto& sampler : ir.GetSamplers()) {
const std::string name = GetSampler(sampler);
const std::string description = fmt::format("layout (binding = {}) uniform", binding);
- binding += sampler.IsIndexed() ? sampler.Size() : 1;
+ binding += sampler.is_indexed ? sampler.size : 1;
std::string sampler_type = [&]() {
- if (sampler.IsBuffer()) {
+ if (sampler.is_buffer) {
return "samplerBuffer";
}
- switch (sampler.GetType()) {
- case Tegra::Shader::TextureType::Texture1D:
+ switch (sampler.type) {
+ case TextureType::Texture1D:
return "sampler1D";
- case Tegra::Shader::TextureType::Texture2D:
+ case TextureType::Texture2D:
return "sampler2D";
- case Tegra::Shader::TextureType::Texture3D:
+ case TextureType::Texture3D:
return "sampler3D";
- case Tegra::Shader::TextureType::TextureCube:
+ case TextureType::TextureCube:
return "samplerCube";
default:
UNREACHABLE();
return "sampler2D";
}
}();
- if (sampler.IsArray()) {
+ if (sampler.is_array) {
sampler_type += "Array";
}
- if (sampler.IsShadow()) {
+ if (sampler.is_shadow) {
sampler_type += "Shadow";
}
- if (!sampler.IsIndexed()) {
+ if (!sampler.is_indexed) {
code.AddLine("{} {} {};", description, sampler_type, name);
} else {
- code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size());
+ code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.size);
}
}
if (!ir.GetSamplers().empty()) {
@@ -945,14 +989,14 @@ private:
u32 binding = device.GetBaseBindings(stage).image;
for (const auto& image : ir.GetImages()) {
std::string qualifier = "coherent volatile";
- if (image.IsRead() && !image.IsWritten()) {
+ if (image.is_read && !image.is_written) {
qualifier += " readonly";
- } else if (image.IsWritten() && !image.IsRead()) {
+ } else if (image.is_written && !image.is_read) {
qualifier += " writeonly";
}
- const char* format = image.IsAtomic() ? "r32ui, " : "";
- const char* type_declaration = GetImageTypeDeclaration(image.GetType());
+ const char* format = image.is_atomic ? "r32ui, " : "";
+ const char* type_declaration = GetImageTypeDeclaration(image.type);
code.AddLine("layout ({}binding = {}) {} uniform uimage{} {};", format, binding++,
qualifier, type_declaration, GetImage(image));
}
@@ -1037,42 +1081,51 @@ private:
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
const Node offset = cbuf->GetOffset();
+ const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
// Direct access
const u32 offset_imm = immediate->GetValue();
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
- return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
- offset_imm / (4 * 4), (offset_imm / 4) % 4),
- Type::Uint};
+ if (use_unified_uniforms) {
+ return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+ Type::Uint};
+ } else {
+ return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+ offset_imm / (4 * 4), (offset_imm / 4) % 4),
+ Type::Uint};
+ }
}
- if (std::holds_alternative<OperationNode>(*offset)) {
- // Indirect access
- const std::string final_offset = code.GenerateTemporary();
- code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+ // Indirect access
+ if (use_unified_uniforms) {
+ return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+ Visit(offset).AsUint()),
+ Type::Uint};
+ }
- if (!device.HasComponentIndexingBug()) {
- return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
- final_offset, final_offset),
- Type::Uint};
- }
+ const std::string final_offset = code.GenerateTemporary();
+ code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
- // AMD's proprietary GLSL compiler emits ill code for variable component access.
- // To bypass this driver bug generate 4 ifs, one per each component.
- const std::string pack = code.GenerateTemporary();
- code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
- final_offset);
-
- const std::string result = code.GenerateTemporary();
- code.AddLine("uint {};", result);
- for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
- code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
- pack, GetSwizzle(swizzle));
- }
- return {result, Type::Uint};
+ if (!device.HasComponentIndexingBug()) {
+ return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+ final_offset, final_offset),
+ Type::Uint};
}
- UNREACHABLE_MSG("Unmanaged offset node type");
+ // AMD's proprietary GLSL compiler emits ill code for variable component access.
+ // To bypass this driver bug generate 4 ifs, one per each component.
+ const std::string pack = code.GenerateTemporary();
+ code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+ final_offset);
+
+ const std::string result = code.GenerateTemporary();
+ code.AddLine("uint {};", result);
+ for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+ code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+ GetSwizzle(swizzle));
+ }
+ return {result, Type::Uint};
}
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1144,6 +1197,7 @@ private:
return {"gl_FragCoord"s + GetSwizzle(element), Type::Float};
default:
UNREACHABLE();
+ return {"0", Type::Int};
}
case Attribute::Index::FrontColor:
return {"gl_Color"s + GetSwizzle(element), Type::Float};
@@ -1241,21 +1295,21 @@ private:
switch (element) {
case 0:
UNIMPLEMENTED();
- return {};
+ return std::nullopt;
case 1:
if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) {
- return {};
+ return std::nullopt;
}
return {{"gl_Layer", Type::Int}};
case 2:
if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) {
- return {};
+ return std::nullopt;
}
return {{"gl_ViewportIndex", Type::Int}};
case 3:
return {{"gl_PointSize", Type::Float}};
}
- return {};
+ return std::nullopt;
case Attribute::Index::FrontColor:
return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}};
case Attribute::Index::FrontSecondaryColor:
@@ -1278,7 +1332,7 @@ private:
Type::Float}};
}
UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
- return {};
+ return std::nullopt;
}
}
@@ -1335,16 +1389,27 @@ private:
ASSERT(meta);
const std::size_t count = operation.GetOperandsCount();
- const bool has_array = meta->sampler.IsArray();
- const bool has_shadow = meta->sampler.IsShadow();
+ const bool has_array = meta->sampler.is_array;
+ const bool has_shadow = meta->sampler.is_shadow;
+ const bool workaround_lod_array_shadow_as_grad =
+ !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+ ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+ meta->sampler.type == TextureType::TextureCube);
+
+ std::string expr = "texture";
+
+ if (workaround_lod_array_shadow_as_grad) {
+ expr += "Grad";
+ } else {
+ expr += function_suffix;
+ }
- std::string expr = "texture" + function_suffix;
if (!meta->aoffi.empty()) {
expr += "Offset";
} else if (!meta->ptp.empty()) {
expr += "Offsets";
}
- if (!meta->sampler.IsIndexed()) {
+ if (!meta->sampler.is_indexed) {
expr += '(' + GetSampler(meta->sampler) + ", ";
} else {
expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], ";
@@ -1372,6 +1437,18 @@ private:
expr += ')';
}
+ if (workaround_lod_array_shadow_as_grad) {
+ switch (meta->sampler.type) {
+ case TextureType::Texture2D:
+ return expr + ", vec2(0.0), vec2(0.0))";
+ case TextureType::TextureCube:
+ return expr + ", vec3(0.0), vec3(0.0))";
+ default:
+ UNREACHABLE();
+ break;
+ }
+ }
+
for (const auto& variant : extras) {
if (const auto argument = std::get_if<TextureArgument>(&variant)) {
expr += GenerateTextureArgument(*argument);
@@ -1482,8 +1559,8 @@ private:
dy += '(';
for (std::size_t index = 0; index < components; ++index) {
- const auto operand_x{derivates.at(index * 2)};
- const auto operand_y{derivates.at(index * 2 + 1)};
+ const auto& operand_x{derivates.at(index * 2)};
+ const auto& operand_y{derivates.at(index * 2 + 1)};
dx += Visit(operand_x).AsFloat();
dy += Visit(operand_y).AsFloat();
@@ -1536,7 +1613,9 @@ private:
Expression target;
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
- // Writing to Register::ZeroIndex is a no op
+ // Writing to Register::ZeroIndex is a no op but we still have to visit the source
+ // as it might have side effects.
+ code.AddLine("{};", Visit(src).GetCode());
return {};
}
target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -1838,38 +1917,48 @@ private:
Type::HalfFloat};
}
- template <Type type>
- Expression LogicalLessThan(Operation operation) {
- return GenerateBinaryInfix(operation, "<", Type::Bool, type, type);
- }
-
- template <Type type>
- Expression LogicalEqual(Operation operation) {
- return GenerateBinaryInfix(operation, "==", Type::Bool, type, type);
- }
+ template <const std::string_view& op, Type type, bool unordered = false>
+ Expression Comparison(Operation operation) {
+ static_assert(!unordered || type == Type::Float);
- template <Type type>
- Expression LogicalLessEqual(Operation operation) {
- return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type);
- }
+ Expression expr = GenerateBinaryInfix(operation, op, Type::Bool, type, type);
- template <Type type>
- Expression LogicalGreaterThan(Operation operation) {
- return GenerateBinaryInfix(operation, ">", Type::Bool, type, type);
+ if constexpr (op.compare("!=") == 0 && type == Type::Float && !unordered) {
+ // GLSL's operator!=(float, float) doesn't seem to be ordered. This happens on both AMD's
+ // and Nvidia's proprietary stacks. Manually force an ordered comparison.
+ return {fmt::format("({} && !isnan({}) && !isnan({}))", expr.AsBool(),
+ VisitOperand(operation, 0).AsFloat(),
+ VisitOperand(operation, 1).AsFloat()),
+ Type::Bool};
+ }
+ if constexpr (!unordered) {
+ return expr;
+ }
+ // Unordered comparisons are always true for NaN operands.
+ return {fmt::format("({} || isnan({}) || isnan({}))", expr.AsBool(),
+ VisitOperand(operation, 0).AsFloat(),
+ VisitOperand(operation, 1).AsFloat()),
+ Type::Bool};
}
- template <Type type>
- Expression LogicalNotEqual(Operation operation) {
- return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type);
+ Expression FOrdered(Operation operation) {
+ return {fmt::format("(!isnan({}) && !isnan({}))", VisitOperand(operation, 0).AsFloat(),
+ VisitOperand(operation, 1).AsFloat()),
+ Type::Bool};
}
- template <Type type>
- Expression LogicalGreaterEqual(Operation operation) {
- return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type);
+ Expression FUnordered(Operation operation) {
+ return {fmt::format("(isnan({}) || isnan({}))", VisitOperand(operation, 0).AsFloat(),
+ VisitOperand(operation, 1).AsFloat()),
+ Type::Bool};
}
- Expression LogicalFIsNan(Operation operation) {
- return GenerateUnary(operation, "isnan", Type::Bool, Type::Float);
+ Expression LogicalAddCarry(Operation operation) {
+ const std::string carry = code.GenerateTemporary();
+ code.AddLine("uint {};", carry);
+ code.AddLine("uaddCarry({}, {}, {});", VisitOperand(operation, 0).AsUint(),
+ VisitOperand(operation, 1).AsUint(), carry);
+ return {fmt::format("({} != 0)", carry), Type::Bool};
}
Expression LogicalAssign(Operation operation) {
@@ -1967,24 +2056,39 @@ private:
}
Expression Texture(Operation operation) {
- const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
- ASSERT(meta);
-
- std::string expr = GenerateTexture(
- operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}});
- if (meta->sampler.IsShadow()) {
- expr = "vec4(" + expr + ')';
+ const auto meta = std::get<MetaTexture>(operation.GetMeta());
+ const bool separate_dc = meta.sampler.type == TextureType::TextureCube &&
+ meta.sampler.is_array && meta.sampler.is_shadow;
+ // TODO: Replace this with an array and make GenerateTexture use C++20 std::span
+ const std::vector<TextureIR> extras{
+ TextureOffset{},
+ TextureArgument{Type::Float, meta.bias},
+ };
+ std::string expr = GenerateTexture(operation, "", extras, separate_dc);
+ if (meta.sampler.is_shadow) {
+ expr = fmt::format("vec4({})", expr);
}
- return {expr + GetSwizzle(meta->element), Type::Float};
+ return {expr + GetSwizzle(meta.element), Type::Float};
}
Expression TextureLod(Operation operation) {
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(
- operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
- if (meta->sampler.IsShadow()) {
+ std::string expr{};
+
+ if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+ ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+ meta->sampler.type == TextureType::TextureCube)) {
+ LOG_ERROR(Render_OpenGL,
+ "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+ expr = GenerateTexture(operation, "Lod", {});
+ } else {
+ expr = GenerateTexture(operation, "Lod",
+ {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+ }
+
+ if (meta->sampler.is_shadow) {
expr = "vec4(" + expr + ')';
}
return {expr + GetSwizzle(meta->element), Type::Float};
@@ -1993,11 +2097,11 @@ private:
Expression TextureGather(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
- const auto type = meta.sampler.IsShadow() ? Type::Float : Type::Int;
- const bool separate_dc = meta.sampler.IsShadow();
+ const auto type = meta.sampler.is_shadow ? Type::Float : Type::Int;
+ const bool separate_dc = meta.sampler.is_shadow;
std::vector<TextureIR> ir;
- if (meta.sampler.IsShadow()) {
+ if (meta.sampler.is_shadow) {
ir = {TextureOffset{}};
} else {
ir = {TextureOffset{}, TextureArgument{type, meta.component}};
@@ -2042,7 +2146,7 @@ private:
constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"};
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- UNIMPLEMENTED_IF(meta->sampler.IsArray());
+ UNIMPLEMENTED_IF(meta->sampler.is_array);
const std::size_t count = operation.GetOperandsCount();
std::string expr = "texelFetch(";
@@ -2063,7 +2167,7 @@ private:
}
expr += ')';
- if (meta->lod && !meta->sampler.IsBuffer()) {
+ if (meta->lod && !meta->sampler.is_buffer) {
expr += ", ";
expr += Visit(meta->lod).AsInt();
}
@@ -2074,12 +2178,10 @@ private:
}
Expression TextureGradient(Operation operation) {
- const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
- ASSERT(meta);
-
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
std::string expr =
GenerateTexture(operation, "Grad", {TextureDerivates{}, TextureOffset{}});
- return {std::move(expr) + GetSwizzle(meta->element), Type::Float};
+ return {std::move(expr) + GetSwizzle(meta.element), Type::Float};
}
Expression ImageLoad(Operation operation) {
@@ -2295,6 +2397,18 @@ private:
return {"gl_SubGroupInvocationARB", Type::Uint};
}
+ /// Builds the expression for a thread mask builtin. The `comparison` template argument
+ /// supplies the infix of the builtin name (the table instantiates it with Eq/Ge/Gt/Le/Lt).
+ template <const std::string_view& comparison>
+ Expression ThreadMask(Operation) {
+     if (device.HasWarpIntrinsics()) {
+         return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
+     }
+     if (!device.HasShaderBallot()) {
+         // Neither builtin family is reported by the device: log it and emit a zero mask.
+         LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
+         return {"0U", Type::Uint};
+     }
+     return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
+ }
+
Expression ShuffleIndexed(Operation operation) {
std::string value = VisitOperand(operation, 0).AsFloat();
@@ -2307,7 +2421,21 @@ private:
return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
}
- Expression MemoryBarrierGL(Operation) {
+ /// Emits a GLSL barrier() statement. When the IR reports it is not decompiled, nothing is
+ /// emitted and an error is logged instead. Always yields an empty expression.
+ Expression Barrier(Operation) {
+     if (ir.IsDecompiled()) {
+         code.AddLine("barrier();");
+     } else {
+         LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
+     }
+     return {};
+ }
+
+ /// Emits a GLSL groupMemoryBarrier() statement and yields an empty expression.
+ Expression MemoryBarrierGroup(Operation) {
+ code.AddLine("groupMemoryBarrier();");
+ return {};
+ }
+
+ /// Emits a GLSL memoryBarrier() statement and yields an empty expression.
+ Expression MemoryBarrierGlobal(Operation) {
code.AddLine("memoryBarrier();");
return {};
}
@@ -2316,6 +2444,19 @@ private:
Func() = delete;
~Func() = delete;
+ static constexpr std::string_view LessThan = "<";
+ static constexpr std::string_view Equal = "==";
+ static constexpr std::string_view LessEqual = "<=";
+ static constexpr std::string_view GreaterThan = ">";
+ static constexpr std::string_view NotEqual = "!=";
+ static constexpr std::string_view GreaterEqual = ">=";
+
+ static constexpr std::string_view Eq = "Eq";
+ static constexpr std::string_view Ge = "Ge";
+ static constexpr std::string_view Gt = "Gt";
+ static constexpr std::string_view Le = "Le";
+ static constexpr std::string_view Lt = "Lt";
+
static constexpr std::string_view Add = "Add";
static constexpr std::string_view Min = "Min";
static constexpr std::string_view Max = "Max";
@@ -2417,27 +2558,36 @@ private:
&GLSLDecompiler::LogicalPick2,
&GLSLDecompiler::LogicalAnd2,
- &GLSLDecompiler::LogicalLessThan<Type::Float>,
- &GLSLDecompiler::LogicalEqual<Type::Float>,
- &GLSLDecompiler::LogicalLessEqual<Type::Float>,
- &GLSLDecompiler::LogicalGreaterThan<Type::Float>,
- &GLSLDecompiler::LogicalNotEqual<Type::Float>,
- &GLSLDecompiler::LogicalGreaterEqual<Type::Float>,
- &GLSLDecompiler::LogicalFIsNan,
-
- &GLSLDecompiler::LogicalLessThan<Type::Int>,
- &GLSLDecompiler::LogicalEqual<Type::Int>,
- &GLSLDecompiler::LogicalLessEqual<Type::Int>,
- &GLSLDecompiler::LogicalGreaterThan<Type::Int>,
- &GLSLDecompiler::LogicalNotEqual<Type::Int>,
- &GLSLDecompiler::LogicalGreaterEqual<Type::Int>,
-
- &GLSLDecompiler::LogicalLessThan<Type::Uint>,
- &GLSLDecompiler::LogicalEqual<Type::Uint>,
- &GLSLDecompiler::LogicalLessEqual<Type::Uint>,
- &GLSLDecompiler::LogicalGreaterThan<Type::Uint>,
- &GLSLDecompiler::LogicalNotEqual<Type::Uint>,
- &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, false>,
+ &GLSLDecompiler::Comparison<Func::Equal, Type::Float, false>,
+ &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, false>,
+ &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, false>,
+ &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, false>,
+ &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, false>,
+ &GLSLDecompiler::FOrdered,
+ &GLSLDecompiler::FUnordered,
+ &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, true>,
+ &GLSLDecompiler::Comparison<Func::Equal, Type::Float, true>,
+ &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, true>,
+ &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, true>,
+ &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, true>,
+ &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, true>,
+
+ &GLSLDecompiler::Comparison<Func::LessThan, Type::Int>,
+ &GLSLDecompiler::Comparison<Func::Equal, Type::Int>,
+ &GLSLDecompiler::Comparison<Func::LessEqual, Type::Int>,
+ &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Int>,
+ &GLSLDecompiler::Comparison<Func::NotEqual, Type::Int>,
+ &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Int>,
+
+ &GLSLDecompiler::Comparison<Func::LessThan, Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::Equal, Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::LessEqual, Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::NotEqual, Type::Uint>,
+ &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Uint>,
+
+ &GLSLDecompiler::LogicalAddCarry,
&GLSLDecompiler::Logical2HLessThan<false>,
&GLSLDecompiler::Logical2HEqual<false>,
@@ -2524,9 +2674,16 @@ private:
&GLSLDecompiler::VoteEqual,
&GLSLDecompiler::ThreadId,
+ &GLSLDecompiler::ThreadMask<Func::Eq>,
+ &GLSLDecompiler::ThreadMask<Func::Ge>,
+ &GLSLDecompiler::ThreadMask<Func::Gt>,
+ &GLSLDecompiler::ThreadMask<Func::Le>,
+ &GLSLDecompiler::ThreadMask<Func::Lt>,
&GLSLDecompiler::ShuffleIndexed,
- &GLSLDecompiler::MemoryBarrierGL,
+ &GLSLDecompiler::Barrier,
+ &GLSLDecompiler::MemoryBarrierGroup,
+ &GLSLDecompiler::MemoryBarrierGlobal,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2596,11 +2753,11 @@ private:
}
std::string GetSampler(const Sampler& sampler) const {
- return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
+ return AppendSuffix(sampler.index, "sampler");
}
std::string GetImage(const Image& image) const {
- return AppendSuffix(static_cast<u32>(image.GetIndex()), "image");
+ return AppendSuffix(image.index, "image");
}
std::string AppendSuffix(u32 index, std::string_view name) const {
@@ -2623,15 +2780,6 @@ private:
return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
}
- bool IsRenderTargetEnabled(u32 render_target) const {
- for (u32 component = 0; component < 4; ++component) {
- if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
- return true;
- }
- }
- return false;
- }
-
const Device& device;
const ShaderIR& ir;
const Registry& registry;
@@ -2639,6 +2787,7 @@ private:
const std::string_view identifier;
const std::string_view suffix;
const Header header;
+ const bool use_unified_uniforms;
std::unordered_map<u8, VaryingTFB> transform_feedback;
ShaderWriter code;
@@ -2834,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() {
} // Anonymous namespace
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
ShaderEntries entries;
for (const auto& cbuf : ir.GetConstantBuffers()) {
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2855,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
}
entries.shader_length = ir.GetLength();
+ entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
return entries;
}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e7dbd810c..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -33,36 +33,19 @@ public:
}
private:
- u32 index{};
+ u32 index = 0;
};
-class GlobalMemoryEntry {
-public:
- explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written)
+struct GlobalMemoryEntry {
+ constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read,
+ bool is_written)
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{
is_written} {}
- u32 GetCbufIndex() const {
- return cbuf_index;
- }
-
- u32 GetCbufOffset() const {
- return cbuf_offset;
- }
-
- bool IsRead() const {
- return is_read;
- }
-
- bool IsWritten() const {
- return is_written;
- }
-
-private:
- u32 cbuf_index{};
- u32 cbuf_offset{};
- bool is_read{};
- bool is_written{};
+ u32 cbuf_index = 0;
+ u32 cbuf_offset = 0;
+ bool is_read = false;
+ bool is_written = false;
};
struct ShaderEntries {
@@ -70,11 +53,13 @@ struct ShaderEntries {
std::vector<GlobalMemoryEntry> global_memory_entries;
std::vector<SamplerEntry> samplers;
std::vector<ImageEntry> images;
- u32 clip_distances{};
std::size_t shader_length{};
+ u32 clip_distances{};
+ bool use_unified_uniforms{};
};
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+ Tegra::Engines::ShaderType stage);
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
const VideoCommon::Shader::Registry& registry,
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..70dd0c3c6 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
namespace {
+using VideoCommon::Shader::SeparateSamplerKey;
+
using ShaderCacheVersionHash = std::array<u8, 64>;
struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
u32 value = 0;
};
-struct BoundSamplerKey {
+/// Flat record of one bound sampler (offset plus descriptor). ShaderDiskCacheEntry::Load and
+/// ::Save read and write arrays of this struct directly, so its layout is part of the file format.
+struct BoundSamplerEntry {
u32 offset = 0;
Tegra::Engines::SamplerDescriptor sampler;
};
-struct BindlessSamplerKey {
+/// Flat record of one separate sampler. ShaderDiskCacheEntry::Load and ::Save read and write
+/// arrays of this struct directly, so its layout is part of the file format.
+/// cbuf1/cbuf2 map to SeparateSamplerKey::buffers and offset1/offset2 to ::offsets.
+struct SeparateSamplerEntry {
+ u32 cbuf1 = 0;
+ u32 cbuf2 = 0;
+ u32 offset1 = 0;
+ u32 offset2 = 0;
+ Tegra::Engines::SamplerDescriptor sampler;
+};
+
+/// Flat record of one bindless sampler (const buffer index, offset and descriptor).
+/// ShaderDiskCacheEntry::Load and ::Save read and write arrays of this struct directly.
+struct BindlessSamplerEntry {
u32 cbuf = 0;
u32 offset = 0;
Tegra::Engines::SamplerDescriptor sampler;
};
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
ShaderCacheVersionHash GetShaderCacheVersionHash() {
ShaderCacheVersionHash hash{};
@@ -63,7 +73,7 @@ ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default;
ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default;
-bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
+bool ShaderDiskCacheEntry::Load(Common::FS::IOFile& file) {
if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
return false;
}
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
u32 texture_handler_size_value;
u32 num_keys;
u32 num_bound_samplers;
+ u32 num_separate_samplers;
u32 num_bindless_samplers;
if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
file.ReadArray(&texture_handler_size_value, 1) != 1 ||
file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+ file.ReadArray(&num_separate_samplers, 1) != 1 ||
file.ReadArray(&num_bindless_samplers, 1) != 1) {
return false;
}
@@ -101,29 +113,38 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
}
std::vector<ConstBufferKey> flat_keys(num_keys);
- std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
- std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+ std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+ std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+ std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
flat_bound_samplers.size() ||
+ file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+ flat_separate_samplers.size() ||
file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
flat_bindless_samplers.size()) {
return false;
}
- for (const auto& key : flat_keys) {
- keys.insert({{key.cbuf, key.offset}, key.value});
+ for (const auto& entry : flat_keys) {
+ keys.insert({{entry.cbuf, entry.offset}, entry.value});
+ }
+ for (const auto& entry : flat_bound_samplers) {
+ bound_samplers.emplace(entry.offset, entry.sampler);
}
- for (const auto& key : flat_bound_samplers) {
- bound_samplers.emplace(key.offset, key.sampler);
+ for (const auto& entry : flat_separate_samplers) {
+ SeparateSamplerKey key;
+ key.buffers = {entry.cbuf1, entry.cbuf2};
+ key.offsets = {entry.offset1, entry.offset2};
+ separate_samplers.emplace(key, entry.sampler);
}
- for (const auto& key : flat_bindless_samplers) {
- bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+ for (const auto& entry : flat_bindless_samplers) {
+ bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
}
return true;
}
-bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
+bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const {
if (file.WriteObject(static_cast<u32>(type)) != 1 ||
file.WriteObject(static_cast<u32>(code.size())) != 1 ||
file.WriteObject(static_cast<u32>(code_b.size())) != 1) {
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+ file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
return false;
}
@@ -152,48 +174,64 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
}
- std::vector<BoundSamplerKey> flat_bound_samplers;
+ std::vector<BoundSamplerEntry> flat_bound_samplers;
flat_bound_samplers.reserve(bound_samplers.size());
for (const auto& [address, sampler] : bound_samplers) {
- flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+ flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
}
- std::vector<BindlessSamplerKey> flat_bindless_samplers;
+ std::vector<SeparateSamplerEntry> flat_separate_samplers;
+ flat_separate_samplers.reserve(separate_samplers.size());
+ for (const auto& [key, sampler] : separate_samplers) {
+ SeparateSamplerEntry entry;
+ std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+ std::tie(entry.offset1, entry.offset2) = key.offsets;
+ entry.sampler = sampler;
+ flat_separate_samplers.push_back(entry);
+ }
+
+ std::vector<BindlessSamplerEntry> flat_bindless_samplers;
flat_bindless_samplers.reserve(bindless_samplers.size());
for (const auto& [address, sampler] : bindless_samplers) {
flat_bindless_samplers.push_back(
- BindlessSamplerKey{address.first, address.second, sampler});
+ BindlessSamplerEntry{address.first, address.second, sampler});
}
return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
flat_bound_samplers.size() &&
+ file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+ flat_separate_samplers.size() &&
file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
flat_bindless_samplers.size();
}
-ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}
+ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default;
ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
+/// Stores the title ID that later operations use, e.g. GetTitleID() when formatting the
+/// cache file name and LoadTransferable() when checking that a title is set.
+void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) {
+ title_id = title_id_;
+}
+
std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
// Skip games without title id
- const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
- if (!Settings::values.use_disk_shader_cache || !has_title_id) {
- return {};
+ const bool has_title_id = title_id != 0;
+ if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) {
+ return std::nullopt;
}
- FileUtil::IOFile file(GetTransferablePath(), "rb");
+ Common::FS::IOFile file(GetTransferablePath(), "rb");
if (!file.IsOpen()) {
LOG_INFO(Render_OpenGL, "No transferable shader cache found");
is_usable = true;
- return {};
+ return std::nullopt;
}
u32 version{};
if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) {
LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it");
- return {};
+ return std::nullopt;
}
if (version < NativeVersion) {
@@ -201,12 +239,12 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran
file.Close();
InvalidateTransferable();
is_usable = true;
- return {};
+ return std::nullopt;
}
if (version > NativeVersion) {
LOG_WARNING(Render_OpenGL, "Transferable shader cache was generated with a newer version "
"of the emulator, skipping");
- return {};
+ return std::nullopt;
}
// Version is valid, load the shaders
@@ -215,7 +253,7 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran
ShaderDiskCacheEntry& entry = entries.emplace_back();
if (!entry.Load(file)) {
LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping");
- return {};
+ return std::nullopt;
}
}
@@ -228,7 +266,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled()
return {};
}
- FileUtil::IOFile file(GetPrecompiledPath(), "rb");
+ Common::FS::IOFile file(GetPrecompiledPath(), "rb");
if (!file.IsOpen()) {
LOG_INFO(Render_OpenGL, "No precompiled shader cache found");
return {};
@@ -245,7 +283,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled()
}
std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(
- FileUtil::IOFile& file) {
+ Common::FS::IOFile& file) {
// Read compressed file from disk and decompress to virtual precompiled cache file
std::vector<u8> compressed(file.GetSize());
file.ReadBytes(compressed.data(), compressed.size());
@@ -256,12 +294,12 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo
ShaderCacheVersionHash file_hash{};
if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) {
precompiled_cache_virtual_file_offset = 0;
- return {};
+ return std::nullopt;
}
if (GetShaderCacheVersionHash() != file_hash) {
LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator");
precompiled_cache_virtual_file_offset = 0;
- return {};
+ return std::nullopt;
}
std::vector<ShaderDiskCachePrecompiled> entries;
@@ -271,19 +309,19 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo
if (!LoadObjectFromPrecompiled(entry.unique_identifier) ||
!LoadObjectFromPrecompiled(entry.binary_format) ||
!LoadObjectFromPrecompiled(binary_size)) {
- return {};
+ return std::nullopt;
}
entry.binary.resize(binary_size);
if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) {
- return {};
+ return std::nullopt;
}
}
return entries;
}
void ShaderDiskCacheOpenGL::InvalidateTransferable() {
- if (!FileUtil::Delete(GetTransferablePath())) {
+ if (!Common::FS::Delete(GetTransferablePath())) {
LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}",
GetTransferablePath());
}
@@ -294,7 +332,7 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
// Clear virtual precompiled cache file
precompiled_cache_virtual_file.Resize(0);
- if (!FileUtil::Delete(GetPrecompiledPath())) {
+ if (!Common::FS::Delete(GetPrecompiledPath())) {
LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath());
}
}
@@ -310,7 +348,7 @@ void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) {
return;
}
- FileUtil::IOFile file = AppendTransferableFile();
+ Common::FS::IOFile file = AppendTransferableFile();
if (!file.IsOpen()) {
return;
}
@@ -352,15 +390,15 @@ void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint progra
}
}
-FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const {
+Common::FS::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const {
if (!EnsureDirectories()) {
return {};
}
const auto transferable_path{GetTransferablePath()};
- const bool existed = FileUtil::Exists(transferable_path);
+ const bool existed = Common::FS::Exists(transferable_path);
- FileUtil::IOFile file(transferable_path, "ab");
+ Common::FS::IOFile file(transferable_path, "ab");
if (!file.IsOpen()) {
LOG_ERROR(Render_OpenGL, "Failed to open transferable cache in path={}", transferable_path);
return {};
@@ -392,7 +430,7 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size());
const auto precompiled_path{GetPrecompiledPath()};
- FileUtil::IOFile file(precompiled_path, "wb");
+ Common::FS::IOFile file(precompiled_path, "wb");
if (!file.IsOpen()) {
LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path);
@@ -406,24 +444,24 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
bool ShaderDiskCacheOpenGL::EnsureDirectories() const {
const auto CreateDir = [](const std::string& dir) {
- if (!FileUtil::CreateDir(dir)) {
+ if (!Common::FS::CreateDir(dir)) {
LOG_ERROR(Render_OpenGL, "Failed to create directory={}", dir);
return false;
}
return true;
};
- return CreateDir(FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir)) &&
+ return CreateDir(Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir)) &&
CreateDir(GetBaseDir()) && CreateDir(GetTransferableDir()) &&
CreateDir(GetPrecompiledDir());
}
std::string ShaderDiskCacheOpenGL::GetTransferablePath() const {
- return FileUtil::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin");
+ return Common::FS::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin");
}
std::string ShaderDiskCacheOpenGL::GetPrecompiledPath() const {
- return FileUtil::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin");
+ return Common::FS::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin");
}
std::string ShaderDiskCacheOpenGL::GetTransferableDir() const {
@@ -435,11 +473,11 @@ std::string ShaderDiskCacheOpenGL::GetPrecompiledDir() const {
}
std::string ShaderDiskCacheOpenGL::GetBaseDir() const {
- return FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir) + DIR_SEP "opengl";
+ return Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir) + DIR_SEP "opengl";
}
std::string ShaderDiskCacheOpenGL::GetTitleID() const {
- return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID());
+ return fmt::format("{:016X}", title_id);
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..aef841c1d 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -21,11 +21,7 @@
#include "video_core/engines/shader_type.h"
#include "video_core/shader/registry.h"
-namespace Core {
-class System;
-}
-
-namespace FileUtil {
+namespace Common::FS {
class IOFile;
}
@@ -38,9 +34,9 @@ struct ShaderDiskCacheEntry {
ShaderDiskCacheEntry();
~ShaderDiskCacheEntry();
- bool Load(FileUtil::IOFile& file);
+ bool Load(Common::FS::IOFile& file);
- bool Save(FileUtil::IOFile& file) const;
+ bool Save(Common::FS::IOFile& file) const;
bool HasProgramA() const {
return !code.empty() && !code_b.empty();
@@ -57,6 +53,7 @@ struct ShaderDiskCacheEntry {
VideoCommon::Shader::ComputeInfo compute_info;
VideoCommon::Shader::KeyMap keys;
VideoCommon::Shader::BoundSamplerMap bound_samplers;
+ VideoCommon::Shader::SeparateSamplerMap separate_samplers;
VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
};
@@ -69,9 +66,12 @@ struct ShaderDiskCachePrecompiled {
class ShaderDiskCacheOpenGL {
public:
- explicit ShaderDiskCacheOpenGL(Core::System& system);
+ explicit ShaderDiskCacheOpenGL();
~ShaderDiskCacheOpenGL();
+ /// Binds a title ID for all future operations.
+ void BindTitleID(u64 title_id);
+
/// Loads transferable cache. If file has an old version or on failure, it deletes the file.
std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable();
@@ -96,10 +96,10 @@ public:
private:
/// Loads the precompiled cache from the given file. Returns std::nullopt on failure.
std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile(
- FileUtil::IOFile& file);
+ Common::FS::IOFile& file);
/// Opens the current game's transferable file and writes its header if it doesn't exist
- FileUtil::IOFile AppendTransferableFile() const;
+ Common::FS::IOFile AppendTransferableFile() const;
/// Save precompiled header to precompiled_cache_in_memory
void SavePrecompiledHeaderToVirtualPrecompiledCache();
@@ -156,8 +156,6 @@ private:
return LoadArrayFromPrecompiled(&object, 1);
}
- Core::System& system;
-
// Stores the whole precompiled cache, which will be read from or saved to the precompiled cache
// file
FileSys::VectorVfsFile precompiled_cache_virtual_file;
@@ -167,8 +165,11 @@ private:
// Stored transferable shaders
std::unordered_set<u64> stored_transferable;
+ /// Title ID to operate on
+ u64 title_id = 0;
+
// The cache has been loaded at boot
- bool is_usable{};
+ bool is_usable = false;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 9c7b0adbd..691c6c79b 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,47 +6,124 @@
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
-namespace OpenGL::GLShader {
+namespace OpenGL {
-ProgramManager::ProgramManager() = default;
+namespace {
+
+/// Binds `current` to the program target `stage`, enabling or disabling the target as needed.
+/// `enabled` tracks whether the target is currently enabled and is updated in place.
+/// Nothing happens when `current` equals `old`.
+void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) {
+    if (current == old) {
+        return;
+    }
+    if (current != 0) {
+        // A program is requested: make sure the target is enabled before binding it.
+        if (!enabled) {
+            enabled = true;
+            glEnable(stage);
+        }
+        glBindProgramARB(stage, current);
+        return;
+    }
+    // Program zero was requested: turn the target off if it is still on.
+    if (enabled) {
+        enabled = false;
+        glDisable(stage);
+    }
+}
+
+} // Anonymous namespace
+
+/// Chooses between assembly programs and GLSL programs based on the device, then performs
+/// the one-time GL setup for the chosen path.
+ProgramManager::ProgramManager(const Device& device)
+    : use_assembly_programs{device.UseAssemblyShaders()} {
+    if (!use_assembly_programs) {
+        // GLSL path: create the graphics pipeline object and bind it.
+        graphics_pipeline.Create();
+        glBindProgramPipeline(graphics_pipeline.handle);
+        return;
+    }
+    // Assembly path: enable the compute program target.
+    glEnable(GL_COMPUTE_PROGRAM_NV);
+}
ProgramManager::~ProgramManager() = default;
-void ProgramManager::Create() {
- graphics_pipeline.Create();
- glBindProgramPipeline(graphics_pipeline.handle);
+/// Binds `program` as the compute program, through the assembly target or glUseProgram.
+void ProgramManager::BindCompute(GLuint program) {
+    if (use_assembly_programs) {
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+        return;
+    }
+    // Record that glUseProgram now holds a compute program; UpdateSourcePrograms() and
+    // BindHostPipeline() check this flag and call glUseProgram(0) when it is false.
+    is_graphics_bound = false;
+    glUseProgram(program);
+}
/// Updates the GLSL program stages via UpdateSourcePrograms(). Does nothing when assembly
/// programs are in use.
void ProgramManager::BindGraphicsPipeline() {
+ if (!use_assembly_programs) {
+ UpdateSourcePrograms();
+ }
+}
+
+/// Binds a host-owned program pipeline object.
+/// On the assembly path the geometry program target is disabled first if it was enabled;
+/// on the GLSL path glUseProgram(0) is issued when a compute program was the last one bound.
+void ProgramManager::BindHostPipeline(GLuint pipeline) {
+    if (!use_assembly_programs) {
+        if (!is_graphics_bound) {
+            glUseProgram(0);
+        }
+    } else if (geometry_enabled) {
+        geometry_enabled = false;
+        old_state.geometry = 0;
+        glDisable(GL_GEOMETRY_PROGRAM_NV);
+    }
+    glBindProgramPipeline(pipeline);
+}
+
+/// Rebinds the guest pipeline object after BindHostPipeline: pipeline zero on the assembly
+/// path, the manager's own graphics pipeline otherwise.
+void ProgramManager::RestoreGuestPipeline() {
+    const GLuint guest_pipeline = use_assembly_programs ? 0 : graphics_pipeline.handle;
+    glBindProgramPipeline(guest_pipeline);
+}
+
+/// Selects the vertex program. With assembly programs it is bound right away through
+/// BindProgram, using the previously selected vertex handle to skip redundant binds;
+/// otherwise the handle is only recorded in current_state.
+void ProgramManager::UseVertexShader(GLuint program) {
+ if (use_assembly_programs) {
+ BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled);
+ }
+ current_state.vertex = program;
+}
+
+/// Selects the geometry program. With assembly programs it is bound right away through
+/// BindProgram; otherwise the handle is only recorded in current_state.
+void ProgramManager::UseGeometryShader(GLuint program) {
+    if (use_assembly_programs) {
+        // Compare against the geometry program that is actually bound, not the vertex one.
+        // BindHostPipeline can disable the geometry target without touching
+        // current_state.geometry, so a disabled target is treated as having program zero
+        // bound; this forces the program to be re-enabled and rebound afterwards.
+        const GLuint bound = geometry_enabled ? current_state.geometry : 0;
+        BindProgram(GL_GEOMETRY_PROGRAM_NV, program, bound, geometry_enabled);
+    }
+    current_state.geometry = program;
+}
+
+/// Selects the fragment program. With assembly programs it is bound right away through
+/// BindProgram; otherwise the handle is only recorded in current_state.
+void ProgramManager::UseFragmentShader(GLuint program) {
+    if (use_assembly_programs) {
+        // The "old" handle must be the previously selected fragment program (mirroring
+        // UseVertexShader); comparing against the vertex handle defeats the redundancy check
+        // and can skip a needed disable when the vertex handle happens to be zero.
+        BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.fragment, fragment_enabled);
+    }
+    current_state.fragment = program;
+}
+
+void ProgramManager::UpdateSourcePrograms() {
if (!is_graphics_bound) {
is_graphics_bound = true;
glUseProgram(0);
}
- // Avoid updating the pipeline when values have no changed
- if (old_state == current_state) {
- return;
- }
-
- // Workaround for AMD bug
- static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
- GL_FRAGMENT_SHADER_BIT};
const GLuint handle = graphics_pipeline.handle;
- glUseProgramStages(handle, all_used_stages, 0);
- glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
- glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
- glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+ const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
+ if (current == old) {
+ return;
+ }
+ glUseProgramStages(handle, stage, current);
+ };
+ update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
+ update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
+ update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
old_state = current_state;
}
-void ProgramManager::BindComputeShader(GLuint program) {
- is_graphics_bound = false;
- glUseProgram(program);
-}
-
void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
const auto& regs = maxwell.regs;
@@ -54,4 +131,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
}
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index d2e47f2a9..950e0dfcb 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
-namespace OpenGL::GLShader {
+namespace OpenGL {
+
+class Device;
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,47 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
class ProgramManager {
public:
- explicit ProgramManager();
+ explicit ProgramManager(const Device& device);
~ProgramManager();
- void Create();
+ /// Binds a compute program
+ void BindCompute(GLuint program);
- /// Updates the graphics pipeline and binds it.
+ /// Updates bound programs.
void BindGraphicsPipeline();
- /// Binds a compute shader.
- void BindComputeShader(GLuint program);
-
- void UseVertexShader(GLuint program) {
- current_state.vertex_shader = program;
- }
+ /// Binds an OpenGL pipeline object unsynchronized with the guest state.
+ void BindHostPipeline(GLuint pipeline);
- void UseGeometryShader(GLuint program) {
- current_state.geometry_shader = program;
- }
+ /// Rewinds BindHostPipeline state changes.
+ void RestoreGuestPipeline();
- void UseFragmentShader(GLuint program) {
- current_state.fragment_shader = program;
- }
+ void UseVertexShader(GLuint program);
+ void UseGeometryShader(GLuint program);
+ void UseFragmentShader(GLuint program);
private:
struct PipelineState {
- bool operator==(const PipelineState& rhs) const noexcept {
- return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
- geometry_shader == rhs.geometry_shader;
- }
-
- bool operator!=(const PipelineState& rhs) const noexcept {
- return !operator==(rhs);
- }
-
- GLuint vertex_shader = 0;
- GLuint fragment_shader = 0;
- GLuint geometry_shader = 0;
+ GLuint vertex = 0;
+ GLuint geometry = 0;
+ GLuint fragment = 0;
};
+ /// Updates GLSL programs.
+ void UpdateSourcePrograms();
+
OGLPipeline graphics_pipeline;
- OGLPipeline compute_pipeline;
+
PipelineState current_state;
PipelineState old_state;
+
+ bool use_assembly_programs = false;
+
bool is_graphics_bound = true;
+
+ bool vertex_enabled = false;
+ bool geometry_enabled = false;
+ bool fragment_enabled = false;
};
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index 9e74eda0d..4bf0d6090 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <string_view>
#include <vector>
#include <glad/glad.h>
#include "common/assert.h"
@@ -11,7 +12,8 @@
namespace OpenGL::GLShader {
namespace {
-const char* GetStageDebugName(GLenum type) {
+
+std::string_view StageDebugName(GLenum type) {
switch (type) {
case GL_VERTEX_SHADER:
return "vertex";
@@ -25,12 +27,17 @@ const char* GetStageDebugName(GLenum type) {
UNIMPLEMENTED();
return "unknown";
}
+
} // Anonymous namespace
-GLuint LoadShader(const char* source, GLenum type) {
- const char* debug_type = GetStageDebugName(type);
+GLuint LoadShader(std::string_view source, GLenum type) {
+ const std::string_view debug_type = StageDebugName(type);
const GLuint shader_id = glCreateShader(type);
- glShaderSource(shader_id, 1, &source, nullptr);
+
+ const GLchar* source_string = source.data();
+ const GLint source_length = static_cast<GLint>(source.size());
+
+ glShaderSource(shader_id, 1, &source_string, &source_length);
LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
glCompileShader(shader_id);
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 03b7548c2..1b770532e 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -38,7 +38,7 @@ void LogShaderSource(T... shaders) {
* @param source String of the GLSL shader program
* @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER)
*/
-GLuint LoadShader(const char* source, GLenum type);
+GLuint LoadShader(std::string_view source, GLenum type);
/**
* Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader)
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp
index d24fad3de..6bcf831f2 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -214,10 +214,8 @@ void SetupDirtyMisc(Tables& tables) {
} // Anonymous namespace
-StateTracker::StateTracker(Core::System& system) : system{system} {}
-
-void StateTracker::Initialize() {
- auto& dirty = system.GPU().Maxwell3D().dirty;
+StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} {
+ auto& dirty = gpu.Maxwell3D().dirty;
auto& tables = dirty.tables;
SetupDirtyRenderTargets(tables);
SetupDirtyColorMasks(tables);
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index 0f823288e..9d127548f 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -13,8 +13,8 @@
#include "video_core/dirty_flags.h"
#include "video_core/engines/maxwell_3d.h"
-namespace Core {
-class System;
+namespace Tegra {
+class GPU;
}
namespace OpenGL {
@@ -90,9 +90,7 @@ static_assert(Last <= std::numeric_limits<u8>::max());
class StateTracker {
public:
- explicit StateTracker(Core::System& system);
-
- void Initialize();
+ explicit StateTracker(Tegra::GPU& gpu);
void BindIndexBuffer(GLuint new_index_buffer) {
if (index_buffer == new_index_buffer) {
@@ -103,7 +101,6 @@ public:
}
void NotifyScreenDrawVertexArray() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::VertexFormats] = true;
flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
flags[OpenGL::Dirty::VertexFormat0 + 1] = true;
@@ -117,98 +114,81 @@ public:
}
void NotifyPolygonModes() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::PolygonModes] = true;
flags[OpenGL::Dirty::PolygonModeFront] = true;
flags[OpenGL::Dirty::PolygonModeBack] = true;
}
void NotifyViewport0() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::Viewports] = true;
flags[OpenGL::Dirty::Viewport0] = true;
}
void NotifyScissor0() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::Scissors] = true;
flags[OpenGL::Dirty::Scissor0] = true;
}
void NotifyColorMask0() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::ColorMasks] = true;
flags[OpenGL::Dirty::ColorMask0] = true;
}
void NotifyBlend0() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::BlendStates] = true;
flags[OpenGL::Dirty::BlendState0] = true;
}
void NotifyFramebuffer() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[VideoCommon::Dirty::RenderTargets] = true;
}
void NotifyFrontFace() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::FrontFace] = true;
}
void NotifyCullTest() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::CullTest] = true;
}
void NotifyDepthMask() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::DepthMask] = true;
}
void NotifyDepthTest() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::DepthTest] = true;
}
void NotifyStencilTest() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::StencilTest] = true;
}
void NotifyPolygonOffset() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::PolygonOffset] = true;
}
void NotifyRasterizeEnable() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::RasterizeEnable] = true;
}
void NotifyFramebufferSRGB() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::FramebufferSRGB] = true;
}
void NotifyLogicOp() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::LogicOp] = true;
}
void NotifyClipControl() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::ClipControl] = true;
}
void NotifyAlphaTest() {
- auto& flags = system.GPU().Maxwell3D().dirty.flags;
flags[OpenGL::Dirty::AlphaTest] = true;
}
private:
- Core::System& system;
+ Tegra::Engines::Maxwell3D::DirtyState::Flags& flags;
GLuint index_buffer = 0;
};
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..887995cf4 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <deque>
+#include <tuple>
#include <vector>
+
#include "common/alignment.h"
#include "common/assert.h"
#include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
namespace OpenGL {
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
- bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
: buffer_size(size) {
gl_buffer.Create();
@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
allocate_size *= 2;
}
- if (use_persistent) {
- persistent = true;
- coherent = prefer_coherent;
- const GLbitfield flags =
- GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
- glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
- mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
- gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
- } else {
- glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+ static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+ glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+ mapped_ptr = static_cast<u8*>(
+ glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+ if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
+ glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+ glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}
}
OGLStreamBuffer::~OGLStreamBuffer() {
- if (persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
- }
+ glUnmapNamedBuffer(gl_buffer.handle);
gl_buffer.Release();
}
-GLuint OGLStreamBuffer::GetHandle() const {
- return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
- return buffer_size;
-}
-
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
ASSERT(size <= buffer_size);
ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
bool invalidate = false;
if (buffer_pos + size > buffer_size) {
+ MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+ glInvalidateBufferData(gl_buffer.handle);
+
buffer_pos = 0;
invalidate = true;
-
- if (persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
- }
}
- if (invalidate || !persistent) {
- MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
- GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
- (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
- (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
- mapped_ptr = static_cast<u8*>(
- glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
- mapped_offset = buffer_pos;
- }
-
- return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+ return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
}
void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size);
- if (!coherent && size > 0) {
- glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
- }
-
- if (!persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
+ if (size > 0) {
+ glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
}
buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@
namespace OpenGL {
+class Device;
+
class OGLStreamBuffer : private NonCopyable {
public:
- explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
- bool use_persistent = true);
+ explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
~OGLStreamBuffer();
- GLuint GetHandle() const;
- GLsizeiptr GetSize() const;
-
/*
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
* and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:
void Unmap(GLsizeiptr size);
+ GLuint Handle() const {
+ return gl_buffer.handle;
+ }
+
+ u64 Address() const {
+ return gpu_address;
+ }
+
+ GLsizeiptr Size() const noexcept {
+ return buffer_size;
+ }
+
private:
OGLBuffer gl_buffer;
- bool coherent = false;
- bool persistent = false;
-
+ GLuint64EXT gpu_address = 0;
GLintptr buffer_pos = 0;
GLsizeiptr buffer_size = 0;
- GLintptr mapped_offset = 0;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
};
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 2729d1265..a863ef218 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,96 +35,109 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
namespace {
struct FormatTuple {
- GLint internal_format;
+ GLenum internal_format;
GLenum format = GL_NONE;
GLenum type = GL_NONE;
};
constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{
- {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // ABGR8U
- {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // ABGR8S
- {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // ABGR8UI
- {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5U
- {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10U
- {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5U
- {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8U
- {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8UI
- {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // RGBA16F
- {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // RGBA16U
- {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // RGBA16S
- {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // RGBA16UI
- {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // R11FG11FB10F
- {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // RGBA32UI
- {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // DXT1
- {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // DXT23
- {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // DXT45
- {GL_COMPRESSED_RED_RGTC1}, // DXN1
- {GL_COMPRESSED_RG_RGTC2}, // DXN2UNORM
- {GL_COMPRESSED_SIGNED_RG_RGTC2}, // DXN2SNORM
- {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7U
- {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UF16
- {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SF16
- {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4
- {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8
- {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // RGBA32F
- {GL_RG32F, GL_RG, GL_FLOAT}, // RG32F
- {GL_R32F, GL_RED, GL_FLOAT}, // R32F
- {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16F
- {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16U
- {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16S
- {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16UI
- {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16I
- {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // RG16
- {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // RG16F
- {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // RG16UI
- {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // RG16I
- {GL_RG16_SNORM, GL_RG, GL_SHORT}, // RG16S
- {GL_RGB32F, GL_RGB, GL_FLOAT}, // RGB32F
- {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // RGBA8_SRGB
- {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8U
- {GL_RG8_SNORM, GL_RG, GL_BYTE}, // RG8S
- {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // RG32UI
- {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // RGBX16F
- {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32UI
- {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32I
- {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8
- {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5
- {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4
- {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8
+ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM
+ {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM
+ {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT
+ {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // A8B8G8R8_UINT
+ {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // R5G6B5_UNORM
+ {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5_UNORM
+ {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1R5G5B5_UNORM
+ {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UNORM
+ {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT
+ {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5_UNORM
+ {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8_UNORM
+ {GL_R8_SNORM, GL_RED, GL_BYTE}, // R8_SNORM
+ {GL_R8I, GL_RED_INTEGER, GL_BYTE}, // R8_SINT
+ {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8_UINT
+ {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16A16_FLOAT
+ {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // R16G16B16A16_UNORM
+ {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // R16G16B16A16_SNORM
+ {GL_RGBA16I, GL_RGBA_INTEGER, GL_SHORT}, // R16G16B16A16_SINT
+ {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // R16G16B16A16_UINT
+ {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // B10G11R11_FLOAT
+ {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // R32G32B32A32_UINT
+ {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // BC1_RGBA_UNORM
+ {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // BC2_UNORM
+ {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // BC3_UNORM
+ {GL_COMPRESSED_RED_RGTC1}, // BC4_UNORM
+ {GL_COMPRESSED_SIGNED_RED_RGTC1}, // BC4_SNORM
+ {GL_COMPRESSED_RG_RGTC2}, // BC5_UNORM
+ {GL_COMPRESSED_SIGNED_RG_RGTC2}, // BC5_SNORM
+ {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7_UNORM
+ {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT
+ {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT
+ {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM
+ {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM
+ {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT
+ {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT
+ {GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT
+ {GL_RG32I, GL_RG_INTEGER, GL_INT}, // R32G32_SINT
+ {GL_R32F, GL_RED, GL_FLOAT}, // R32_FLOAT
+ {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16_FLOAT
+ {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16_UNORM
+ {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16_SNORM
+ {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16_UINT
+ {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16_SINT
+ {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // R16G16_UNORM
+ {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // R16G16_FLOAT
+ {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // R16G16_UINT
+ {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // R16G16_SINT
+ {GL_RG16_SNORM, GL_RG, GL_SHORT}, // R16G16_SNORM
+ {GL_RGB32F, GL_RGB, GL_FLOAT}, // R32G32B32_FLOAT
+ {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_SRGB
+ {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // R8G8_UNORM
+ {GL_RG8_SNORM, GL_RG, GL_BYTE}, // R8G8_SNORM
+ {GL_RG8I, GL_RG_INTEGER, GL_BYTE}, // R8G8_SINT
+ {GL_RG8UI, GL_RG_INTEGER, GL_UNSIGNED_BYTE}, // R8G8_UINT
+ {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // R32G32_UINT
+ {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16X16_FLOAT
+ {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32_UINT
+ {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32_SINT
+ {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM
+ {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM
+ {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM
+ {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_SRGB
// Compressed sRGB formats
- {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // DXT1_SRGB
- {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // DXT23_SRGB
- {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // DXT45_SRGB
- {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7U_SRGB
- {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // R4G4B4A4U
+ {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB
+ {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB
+ {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB
+ {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB
+ {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB
- {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5
+ {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB
- {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8
+ {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB
- {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6
+ {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB
- {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10
+ {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB
- {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12
+ {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB
- {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6
+ {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB
- {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5
+ {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM
{GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB
- {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9F
+ {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT
// Depth formats
- {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // Z32F
- {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // Z16
+ {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT
+ {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM
// DepthStencil formats
- {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // Z24S8
- {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8Z24
- {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // Z32FS8
+ {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT
+ {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM
+ {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL,
+ GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT
}};
const FormatTuple& GetFormatTuple(PixelFormat pixel_format) {
@@ -177,10 +190,10 @@ GLint GetSwizzleSource(SwizzleSource source) {
GLenum GetComponent(PixelFormat format, bool is_first) {
switch (format) {
- case PixelFormat::Z24S8:
- case PixelFormat::Z32FS8:
+ case PixelFormat::D24_UNORM_S8_UINT:
+ case PixelFormat::D32_FLOAT_S8_UINT:
return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX;
- case PixelFormat::S8Z24:
+ case PixelFormat::S8_UINT_D24_UNORM:
return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT;
default:
UNREACHABLE();
@@ -237,6 +250,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
return texture;
}
+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+ SwizzleSource w_source) {
+ return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+ (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
} // Anonymous namespace
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -256,9 +275,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
target = GetTextureTarget(params.target);
texture = CreateTexture(params, target, internal_format, texture_buffer);
DecorateSurfaceName();
- main_view = CreateViewInner(
- ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
- true);
+
+ u32 num_layers = 1;
+ if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+ num_layers = params.depth;
+ }
+
+ main_view =
+ CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
}
CachedSurface::~CachedSurface() = default;
@@ -379,8 +403,8 @@ void CachedSurface::DecorateSurfaceName() {
LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName());
}
-void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
- LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) {
+ LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
}
View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -396,32 +420,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
}
CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
- const bool is_proxy)
- : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
- target = GetTextureTarget(params.target);
- format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+ bool is_proxy)
+ : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
+ target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
if (!is_proxy) {
- texture_view = CreateTextureView();
+ main_view = CreateTextureView();
}
- swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
}
CachedSurfaceView::~CachedSurfaceView() = default;
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
ASSERT(params.num_levels == 1);
+ if (params.target == SurfaceTarget::Texture3D) {
+ if (params.num_layers > 1) {
+ ASSERT(params.base_layer == 0);
+ glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+ } else {
+ glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+ params.base_level, params.base_layer);
+ }
+ return;
+ }
+
if (params.num_layers > 1) {
- // Layered framebuffer attachments
UNIMPLEMENTED_IF(params.base_layer != 0);
-
- switch (params.target) {
- case SurfaceTarget::Texture2DArray:
- glFramebufferTexture(target, attachment, GetTexture(), 0);
- break;
- default:
- UNIMPLEMENTED();
- }
+ glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
return;
}
@@ -429,16 +454,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
const GLuint texture = surface.GetTexture();
switch (surface.GetSurfaceParams().target) {
case SurfaceTarget::Texture1D:
- glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture2D:
- glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture1DArray:
case SurfaceTarget::Texture2DArray:
case SurfaceTarget::TextureCubemap:
case SurfaceTarget::TextureCubeArray:
- glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+ glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
params.base_layer);
break;
default:
@@ -446,44 +471,73 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
}
}
-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
SwizzleSource z_source, SwizzleSource w_source) {
- u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
- if (new_swizzle == swizzle)
- return;
- swizzle = new_swizzle;
- const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
- GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
- const GLuint handle = GetTexture();
- const PixelFormat format = surface.GetSurfaceParams().pixel_format;
- switch (format) {
- case PixelFormat::Z24S8:
- case PixelFormat::Z32FS8:
- case PixelFormat::S8Z24:
- glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+ if (GetSurfaceParams().IsBuffer()) {
+ return GetTexture();
+ }
+ const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+ if (current_swizzle == new_swizzle) {
+ return current_view;
+ }
+ current_swizzle = new_swizzle;
+
+ const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+ OGLTextureView& view = entry->second;
+ if (!is_cache_miss) {
+ current_view = view.handle;
+ return view.handle;
+ }
+ view = CreateTextureView();
+ current_view = view.handle;
+
+ std::array swizzle{x_source, y_source, z_source, w_source};
+
+ switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
+ case PixelFormat::D24_UNORM_S8_UINT:
+ case PixelFormat::D32_FLOAT_S8_UINT:
+ case PixelFormat::S8_UINT_D24_UNORM:
+ UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+ glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
GetComponent(format, x_source == SwizzleSource::R));
- break;
- default:
- glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
+
+ // Make sure we sample the first component
+ std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+ return value == SwizzleSource::G ? SwizzleSource::R : value;
+ });
+ [[fallthrough]];
+ default: {
+ const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+ GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+ glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
break;
}
+ }
+ return view.handle;
}
OGLTextureView CachedSurfaceView::CreateTextureView() const {
OGLTextureView texture_view;
texture_view.Create();
- glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
- params.num_levels, params.base_layer, params.num_layers);
+ if (target == GL_TEXTURE_3D) {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, 0, 1);
+ } else {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, params.base_layer, params.num_layers);
+ }
ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
return texture_view;
}
-TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system,
- VideoCore::RasterizerInterface& rasterizer,
- const Device& device, StateTracker& state_tracker)
- : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} {
+TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer,
+ Tegra::Engines::Maxwell3D& maxwell3d,
+ Tegra::MemoryManager& gpu_memory, const Device& device,
+ StateTracker& state_tracker_)
+ : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{
+ state_tracker_} {
src_framebuffer.Create();
dst_framebuffer.Create();
}
@@ -517,8 +571,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
const Tegra::Engines::Fermi2D::Config& copy_config) {
const auto& src_params{src_view->GetSurfaceParams()};
const auto& dst_params{dst_view->GetSurfaceParams()};
- UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
- UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+ UNIMPLEMENTED_IF(src_params.depth != 1);
+ UNIMPLEMENTED_IF(dst_params.depth != 1);
state_tracker.NotifyScissor0();
state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 02d9981a1..7787134fc 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,15 +80,17 @@ public:
explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
~CachedSurfaceView();
- /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
- void Attach(GLenum attachment, GLenum target) const;
+ /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+ /// @param attachment Attachment to bind textures to
+ /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER)
+ void Attach(GLenum attachment, GLenum fb_target) const;
- void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
+ GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
Tegra::Texture::SwizzleSource y_source,
Tegra::Texture::SwizzleSource z_source,
Tegra::Texture::SwizzleSource w_source);
- void DecorateViewName(GPUVAddr gpu_addr, std::string prefix);
+ void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix);
void MarkAsModified(u64 tick) {
surface.MarkAsModified(true, tick);
@@ -98,7 +100,7 @@ public:
if (is_proxy) {
return surface.GetTexture();
}
- return texture_view.handle;
+ return main_view.handle;
}
GLenum GetFormat() const {
@@ -110,29 +112,27 @@ public:
}
private:
- u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source) const {
- return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
- (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
- }
-
OGLTextureView CreateTextureView() const;
CachedSurface& surface;
- GLenum target{};
- GLenum format{};
+ const GLenum format;
+ const GLenum target;
+ const bool is_proxy;
+
+ std::unordered_map<u32, OGLTextureView> view_cache;
+ OGLTextureView main_view;
- OGLTextureView texture_view;
- u32 swizzle{};
- bool is_proxy{};
+ // Use an invalid default so it always fails the comparison test
+ u32 current_swizzle = 0xffffffff;
+ GLuint current_view = 0;
};
class TextureCacheOpenGL final : public TextureCacheBase {
public:
- explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
- const Device& device, StateTracker& state_tracker);
+ explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer,
+ Tegra::Engines::Maxwell3D& maxwell3d,
+ Tegra::MemoryManager& gpu_memory, const Device& device,
+ StateTracker& state_tracker);
~TextureCacheOpenGL();
protected:
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 89f0e04ef..a8be2aa37 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
switch (attrib.type) {
- case Maxwell::VertexAttribute::Type::UnsignedInt:
case Maxwell::VertexAttribute::Type::UnsignedNorm:
+ case Maxwell::VertexAttribute::Type::UnsignedScaled:
+ case Maxwell::VertexAttribute::Type::UnsignedInt:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8:
case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -47,11 +48,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_UNSIGNED_INT_2_10_10_10_REV;
default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
+ break;
}
- case Maxwell::VertexAttribute::Type::SignedInt:
+ break;
case Maxwell::VertexAttribute::Type::SignedNorm:
+ case Maxwell::VertexAttribute::Type::SignedScaled:
+ case Maxwell::VertexAttribute::Type::SignedInt:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8:
case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -71,9 +73,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_INT_2_10_10_10_REV;
default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
+ break;
}
+ break;
case Maxwell::VertexAttribute::Type::Float:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_16:
@@ -87,45 +89,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return GL_FLOAT;
default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
- }
- case Maxwell::VertexAttribute::Type::UnsignedScaled:
- switch (attrib.size) {
- case Maxwell::VertexAttribute::Size::Size_8:
- case Maxwell::VertexAttribute::Size::Size_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return GL_UNSIGNED_BYTE;
- case Maxwell::VertexAttribute::Size::Size_16:
- case Maxwell::VertexAttribute::Size::Size_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return GL_UNSIGNED_SHORT;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
- }
- case Maxwell::VertexAttribute::Type::SignedScaled:
- switch (attrib.size) {
- case Maxwell::VertexAttribute::Size::Size_8:
- case Maxwell::VertexAttribute::Size::Size_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return GL_BYTE;
- case Maxwell::VertexAttribute::Size::Size_16:
- case Maxwell::VertexAttribute::Size::Size_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return GL_SHORT;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
+ break;
}
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
- return {};
+ break;
}
+ UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
+ attrib.SizeString());
+ return {};
}
inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +107,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
case Maxwell::IndexFormat::UnsignedInt:
return GL_UNSIGNED_INT;
}
- LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
- UNREACHABLE();
+ UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
return {};
}
@@ -180,31 +149,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
}
inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
- Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+ Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
switch (filter_mode) {
- case Tegra::Texture::TextureFilter::Linear: {
- switch (mip_filter_mode) {
+ case Tegra::Texture::TextureFilter::Nearest:
+ switch (mipmap_filter_mode) {
case Tegra::Texture::TextureMipmapFilter::None:
- return GL_LINEAR;
+ return GL_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Nearest:
- return GL_LINEAR_MIPMAP_NEAREST;
+ return GL_NEAREST_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
- return GL_LINEAR_MIPMAP_LINEAR;
+ return GL_NEAREST_MIPMAP_LINEAR;
}
- }
- case Tegra::Texture::TextureFilter::Nearest: {
- switch (mip_filter_mode) {
+ break;
+ case Tegra::Texture::TextureFilter::Linear:
+ switch (mipmap_filter_mode) {
case Tegra::Texture::TextureMipmapFilter::None:
- return GL_NEAREST;
+ return GL_LINEAR;
case Tegra::Texture::TextureMipmapFilter::Nearest:
- return GL_NEAREST_MIPMAP_NEAREST;
+ return GL_LINEAR_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
- return GL_NEAREST_MIPMAP_LINEAR;
+ return GL_LINEAR_MIPMAP_LINEAR;
}
+ break;
}
- }
- LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
- return GL_LINEAR;
+ UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+ static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+ return GL_NEAREST;
}
inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -227,10 +197,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
} else {
return GL_MIRROR_CLAMP_TO_EDGE;
}
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
- return GL_REPEAT;
+ case Tegra::Texture::WrapMode::MirrorOnceClampOGL:
+ if (GL_EXT_texture_mirror_clamp) {
+ return GL_MIRROR_CLAMP_EXT;
+ } else {
+ return GL_MIRROR_CLAMP_TO_EDGE;
+ }
}
+ UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+ return GL_REPEAT;
}
inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -252,8 +227,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
case Tegra::Texture::DepthCompareFunc::Always:
return GL_ALWAYS;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
- static_cast<u32>(func));
+ UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
return GL_GREATER;
}
@@ -275,7 +249,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
case Maxwell::Blend::Equation::MaxGL:
return GL_MAX;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+ UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
return GL_FUNC_ADD;
}
@@ -339,7 +313,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
return GL_ONE_MINUS_CONSTANT_ALPHA;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+ UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
return GL_ZERO;
}
@@ -359,7 +333,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
case Tegra::Texture::SwizzleSource::OneFloat:
return GL_ONE;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+ UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
return GL_ZERO;
}
@@ -390,7 +364,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
case Maxwell::ComparisonOp::AlwaysOld:
return GL_ALWAYS;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+ UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
return GL_ALWAYS;
}
@@ -421,7 +395,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
case Maxwell::StencilOp::DecrWrapOGL:
return GL_DECR_WRAP;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+ UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
return GL_KEEP;
}
@@ -432,7 +406,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
case Maxwell::FrontFace::CounterClockWise:
return GL_CCW;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+ UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
return GL_CCW;
}
@@ -445,7 +419,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
case Maxwell::CullFace::FrontAndBack:
return GL_FRONT_AND_BACK;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+ UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
return GL_BACK;
}
@@ -484,7 +458,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
case Maxwell::LogicOperation::Set:
return GL_SET;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+ UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
return GL_COPY;
}
@@ -501,5 +475,10 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) {
return GL_FILL;
}
+inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) {
+ // Enumeration order matches register order. We can convert it arithmetically.
+ return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle);
+}
+
} // namespace MaxwellToGL
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b2a179746..2ccca1993 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -21,6 +21,8 @@
#include "core/perf_stats.h"
#include "core/settings.h"
#include "core/telemetry_session.h"
+#include "video_core/host_shaders/opengl_present_frag.h"
+#include "video_core/host_shaders/opengl_present_vert.h"
#include "video_core/morton.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
@@ -30,60 +32,6 @@ namespace OpenGL {
namespace {
-constexpr std::size_t SWAP_CHAIN_SIZE = 3;
-
-struct Frame {
- u32 width{}; /// Width of the frame (to detect resize)
- u32 height{}; /// Height of the frame
- bool color_reloaded{}; /// Texture attachment was recreated (ie: resized)
- OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO
- OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread
- OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread
- GLsync render_fence{}; /// Fence created on the render thread
- GLsync present_fence{}; /// Fence created on the presentation thread
- bool is_srgb{}; /// Framebuffer is sRGB or RGB
-};
-
-constexpr char VERTEX_SHADER[] = R"(
-#version 430 core
-
-out gl_PerVertex {
- vec4 gl_Position;
-};
-
-layout (location = 0) in vec2 vert_position;
-layout (location = 1) in vec2 vert_tex_coord;
-layout (location = 0) out vec2 frag_tex_coord;
-
-// This is a truncated 3x3 matrix for 2D transformations:
-// The upper-left 2x2 submatrix performs scaling/rotation/mirroring.
-// The third column performs translation.
-// The third row could be used for projection, which we don't need in 2D. It hence is assumed to
-// implicitly be [0, 0, 1]
-layout (location = 0) uniform mat3x2 modelview_matrix;
-
-void main() {
- // Multiply input position by the rotscale part of the matrix and then manually translate by
- // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector
- // to `vec3(vert_position.xy, 1.0)`
- gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0);
- frag_tex_coord = vert_tex_coord;
-}
-)";
-
-constexpr char FRAGMENT_SHADER[] = R"(
-#version 430 core
-
-layout (location = 0) in vec2 frag_tex_coord;
-layout (location = 0) out vec4 color;
-
-layout (binding = 0) uniform sampler2D color_texture;
-
-void main() {
- color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f);
-}
-)";
-
constexpr GLint PositionLocation = 0;
constexpr GLint TexCoordLocation = 1;
constexpr GLint ModelViewMatrixLocation = 0;
@@ -96,24 +44,6 @@ struct ScreenRectVertex {
std::array<GLfloat, 2> tex_coord;
};
-/// Returns true if any debug tool is attached
-bool HasDebugTool() {
- const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
- if (nsight) {
- return true;
- }
-
- GLint num_extensions;
- glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
- for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) {
- const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index));
- if (!std::strcmp(name, "GL_EXT_debug_tool")) {
- return true;
- }
- }
- return false;
-}
-
/**
* Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left
* corner and (width, height) on the lower-bottom.
@@ -197,132 +127,15 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
} // Anonymous namespace
-/**
- * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
- * but also make sure that rendering happens at the pace that the frontend dictates. This is a
- * helper class that the renderer uses to sync frames between the render thread and the presentation
- * thread
- */
-class FrameMailbox {
-public:
- std::mutex swap_chain_lock;
- std::condition_variable present_cv;
- std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
- std::queue<Frame*> free_queue;
- std::deque<Frame*> present_queue;
- Frame* previous_frame{};
-
- FrameMailbox() {
- for (auto& frame : swap_chain) {
- free_queue.push(&frame);
- }
- }
-
- ~FrameMailbox() {
- // lock the mutex and clear out the present and free_queues and notify any people who are
- // blocked to prevent deadlock on shutdown
- std::scoped_lock lock{swap_chain_lock};
- std::queue<Frame*>().swap(free_queue);
- present_queue.clear();
- present_cv.notify_all();
- }
-
- void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
- frame->present.Release();
- frame->present.Create();
- GLint previous_draw_fbo{};
- glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
- glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
- glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
- frame->color.handle);
- if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
- LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
- }
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
- frame->color_reloaded = false;
- }
-
- void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
- // Recreate the color texture attachment
- frame->color.Release();
- frame->color.Create();
- const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
- glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
-
- // Recreate the FBO for the render target
- frame->render.Release();
- frame->render.Create();
- glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle);
- glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
- frame->color.handle);
- if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
- LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
- }
-
- frame->width = width;
- frame->height = height;
- frame->color_reloaded = true;
- }
-
- Frame* GetRenderFrame() {
- std::unique_lock lock{swap_chain_lock};
-
- // If theres no free frames, we will reuse the oldest render frame
- if (free_queue.empty()) {
- auto frame = present_queue.back();
- present_queue.pop_back();
- return frame;
- }
-
- Frame* frame = free_queue.front();
- free_queue.pop();
- return frame;
- }
-
- void ReleaseRenderFrame(Frame* frame) {
- std::unique_lock lock{swap_chain_lock};
- present_queue.push_front(frame);
- present_cv.notify_one();
- }
-
- Frame* TryGetPresentFrame(int timeout_ms) {
- std::unique_lock lock{swap_chain_lock};
- // wait for new entries in the present_queue
- present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
- [&] { return !present_queue.empty(); });
- if (present_queue.empty()) {
- // timed out waiting for a frame to draw so return the previous frame
- return previous_frame;
- }
-
- // free the previous frame and add it back to the free queue
- if (previous_frame) {
- free_queue.push(previous_frame);
- }
-
- // the newest entries are pushed to the front of the queue
- Frame* frame = present_queue.front();
- present_queue.pop_front();
- // remove all old entries from the present queue and move them back to the free_queue
- for (auto f : present_queue) {
- free_queue.push(f);
- }
- present_queue.clear();
- previous_frame = frame;
- return frame;
- }
-};
-
-RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
- Core::Frontend::GraphicsContext& context)
- : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
- has_debug_tool{HasDebugTool()} {}
+RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
+ Core::Frontend::EmuWindow& emu_window_,
+ Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
+ std::unique_ptr<Core::Frontend::GraphicsContext> context)
+ : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_},
+ emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {}
RendererOpenGL::~RendererOpenGL() = default;
-MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64));
-MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128));
-
void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
if (!framebuffer) {
return;
@@ -331,79 +144,34 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
PrepareRendertarget(framebuffer);
RenderScreenshot();
- Frame* frame;
- {
- MICROPROFILE_SCOPE(OpenGL_WaitPresent);
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+ DrawScreen(emu_window.GetFramebufferLayout());
- frame = frame_mailbox->GetRenderFrame();
+ ++m_current_frame;
- // Clean up sync objects before drawing
-
- // INTEL driver workaround. We can't delete the previous render sync object until we are
- // sure that the presentation is done
- if (frame->present_fence) {
- glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
- }
-
- // delete the draw fence if the frame wasn't presented
- if (frame->render_fence) {
- glDeleteSync(frame->render_fence);
- frame->render_fence = 0;
- }
-
- // wait for the presentation to be done
- if (frame->present_fence) {
- glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
- glDeleteSync(frame->present_fence);
- frame->present_fence = 0;
- }
- }
-
- {
- MICROPROFILE_SCOPE(OpenGL_RenderFrame);
- const auto& layout = render_window.GetFramebufferLayout();
-
- // Recreate the frame if the size of the window has changed
- if (layout.width != frame->width || layout.height != frame->height ||
- screen_info.display_srgb != frame->is_srgb) {
- LOG_DEBUG(Render_OpenGL, "Reloading render frame");
- frame->is_srgb = screen_info.display_srgb;
- frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height);
- }
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle);
- DrawScreen(layout);
- // Create a fence for the frontend to wait on and swap this frame to OffTex
- frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
- glFlush();
- frame_mailbox->ReleaseRenderFrame(frame);
- m_current_frame++;
- rasterizer->TickFrame();
- }
+ rasterizer->TickFrame();
render_window.PollEvents();
- if (has_debug_tool) {
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
- Present(0);
- context.SwapBuffers();
- }
+ context->SwapBuffers();
}
void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) {
- if (framebuffer) {
- // If framebuffer is provided, reload it from memory to a texture
- if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) ||
- screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) ||
- screen_info.texture.pixel_format != framebuffer->pixel_format ||
- gl_framebuffer_data.empty()) {
- // Reallocate texture if the framebuffer size has changed.
- // This is expected to not happen very often and hence should not be a
- // performance problem.
- ConfigureFramebufferTexture(screen_info.texture, *framebuffer);
- }
-
- // Load the framebuffer from memory, draw it to the screen, and swap buffers
- LoadFBToScreenInfo(*framebuffer);
+ if (!framebuffer) {
+ return;
+ }
+ // If framebuffer is provided, reload it from memory to a texture
+ if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) ||
+ screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) ||
+ screen_info.texture.pixel_format != framebuffer->pixel_format ||
+ gl_framebuffer_data.empty()) {
+ // Reallocate texture if the framebuffer size has changed.
+ // This is expected to not happen very often and hence should not be a
+ // performance problem.
+ ConfigureFramebufferTexture(screen_info.texture, *framebuffer);
}
+
+ // Load the framebuffer from memory, draw it to the screen, and swap buffers
+ LoadFBToScreenInfo(*framebuffer);
}
void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) {
@@ -423,7 +191,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)};
const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel};
- u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)};
+ u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)};
rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes);
// TODO(Rodrigo): Read this from HLE
@@ -453,23 +221,22 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
}
void RendererOpenGL::InitOpenGLObjects() {
- frame_mailbox = std::make_unique<FrameMailbox>();
-
- glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
- 0.0f);
+ glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+ Settings::values.bg_blue.GetValue(), 0.0f);
// Create shader programs
OGLShader vertex_shader;
- vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER);
+ vertex_shader.Create(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER);
OGLShader fragment_shader;
- fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER);
+ fragment_shader.Create(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER);
vertex_program.Create(true, false, vertex_shader.handle);
fragment_program.Create(true, false, fragment_shader.handle);
- // Create program pipeline
- program_manager.Create();
+ pipeline.Create();
+ glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
+ glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
// Generate VBO handle for drawing
vertex_buffer.Create();
@@ -487,6 +254,15 @@ void RendererOpenGL::InitOpenGLObjects() {
// Clear screen to black
LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+ // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+ glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+ glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+ &vertex_buffer_address);
+ }
}
void RendererOpenGL::AddTelemetryFields() {
@@ -498,18 +274,18 @@ void RendererOpenGL::AddTelemetryFields() {
LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor);
LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);
- auto& telemetry_session = system.TelemetrySession();
- telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor);
- telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model);
- telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version);
+ constexpr auto user_system = Common::Telemetry::FieldType::UserSystem;
+ telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor);
+ telemetry_session.AddField(user_system, "GPU_Model", gpu_model);
+ telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version);
}
void RendererOpenGL::CreateRasterizer() {
if (rasterizer) {
return;
}
- rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
- program_manager, state_tracker);
+ rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device,
+ screen_info, program_manager, state_tracker);
}
void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
@@ -525,12 +301,12 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
GLint internal_format;
switch (framebuffer.pixel_format) {
- case Tegra::FramebufferConfig::PixelFormat::ABGR8:
+ case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM:
internal_format = GL_RGBA8;
texture.gl_format = GL_RGBA;
texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV;
break;
- case Tegra::FramebufferConfig::PixelFormat::RGB565:
+ case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM:
internal_format = GL_RGB565;
texture.gl_format = GL_RGB;
texture.gl_type = GL_UNSIGNED_SHORT_5_6_5;
@@ -551,8 +327,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
if (renderer_settings.set_background_color) {
// Update background color before drawing
- glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
- 0.0f);
+ glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+ Settings::values.bg_blue.GetValue(), 0.0f);
}
// Set projection matrix
@@ -620,10 +396,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
state_tracker.NotifyClipControl();
state_tracker.NotifyAlphaTest();
- program_manager.UseVertexShader(vertex_program.handle);
- program_manager.UseGeometryShader(0);
- program_manager.UseFragmentShader(fragment_program.handle);
- program_manager.BindGraphicsPipeline();
+ program_manager.BindHostPipeline(pipeline.handle);
glEnable(GL_CULL_FACE);
if (screen_info.display_srgb) {
@@ -658,58 +431,21 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
offsetof(ScreenRectVertex, tex_coord));
glVertexAttribBinding(PositionLocation, 0);
glVertexAttribBinding(TexCoordLocation, 0);
- glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+ sizeof(vertices));
+ } else {
+ glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+ }
glBindTextureUnit(0, screen_info.display_texture);
glBindSampler(0, 0);
glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-}
-bool RendererOpenGL::TryPresent(int timeout_ms) {
- if (has_debug_tool) {
- LOG_DEBUG(Render_OpenGL,
- "Skipping presentation because we are presenting on the main context");
- return false;
- }
- return Present(timeout_ms);
-}
-
-bool RendererOpenGL::Present(int timeout_ms) {
- const auto& layout = render_window.GetFramebufferLayout();
- auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms);
- if (!frame) {
- LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present");
- return false;
- }
-
- // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a
- // readback since we won't be doing any blending
- glClear(GL_COLOR_BUFFER_BIT);
-
- // Recreate the presentation FBO if the color attachment was changed
- if (frame->color_reloaded) {
- LOG_DEBUG(Render_OpenGL, "Reloading present frame");
- frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height);
- }
- glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED);
- // INTEL workaround.
- // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete
- // it on the emulation thread without too much penalty
- // glDeleteSync(frame.render_sync);
- // frame.render_sync = 0;
-
- glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle);
- glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height,
- GL_COLOR_BUFFER_BIT, GL_LINEAR);
-
- // Insert fence for the main thread to block on
- frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
- glFlush();
-
- glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
- return true;
+ program_manager.RestoreGuestPipeline();
}
void RendererOpenGL::RenderScreenshot() {
@@ -726,7 +462,7 @@ void RendererOpenGL::RenderScreenshot() {
screenshot_framebuffer.Create();
glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle);
- Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
+ const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
GLuint renderbuffer;
glGenRenderbuffers(1, &renderbuffer);
@@ -751,8 +487,9 @@ void RendererOpenGL::RenderScreenshot() {
}
bool RendererOpenGL::Init() {
- if (GLAD_GL_KHR_debug) {
+ if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
+ glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
glDebugMessageCallback(DebugHandler, nullptr);
}
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 50b647661..9ef181f95 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -9,22 +9,32 @@
#include "common/common_types.h"
#include "common/math_util.h"
#include "video_core/renderer_base.h"
+#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
namespace Core {
class System;
-}
+class TelemetrySession;
+} // namespace Core
namespace Core::Frontend {
class EmuWindow;
}
+namespace Core::Memory {
+class Memory;
+}
+
namespace Layout {
struct FramebufferLayout;
}
+namespace Tegra {
+class GPU;
+}
+
namespace OpenGL {
/// Structure used for storing information about the textures for the Switch screen
@@ -45,24 +55,17 @@ struct ScreenInfo {
TextureInfo texture;
};
-struct PresentationTexture {
- u32 width = 0;
- u32 height = 0;
- OGLTexture texture;
-};
-
-class FrameMailbox;
-
class RendererOpenGL final : public VideoCore::RendererBase {
public:
- explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
- Core::Frontend::GraphicsContext& context);
+ explicit RendererOpenGL(Core::TelemetrySession& telemetry_session,
+ Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory,
+ Tegra::GPU& gpu,
+ std::unique_ptr<Core::Frontend::GraphicsContext> context);
~RendererOpenGL() override;
bool Init() override;
void ShutDown() override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
- bool TryPresent(int timeout_ms) override;
private:
/// Initializes the OpenGL state and creates persistent objects.
@@ -90,37 +93,36 @@ private:
void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer);
- bool Present(int timeout_ms);
-
+ Core::TelemetrySession& telemetry_session;
Core::Frontend::EmuWindow& emu_window;
- Core::System& system;
- Core::Frontend::GraphicsContext& context;
+ Core::Memory::Memory& cpu_memory;
+ Tegra::GPU& gpu;
- StateTracker state_tracker{system};
+ const Device device;
+ StateTracker state_tracker{gpu};
// OpenGL object IDs
OGLBuffer vertex_buffer;
OGLProgram vertex_program;
OGLProgram fragment_program;
+ OGLPipeline pipeline;
OGLFramebuffer screenshot_framebuffer;
+ // GPU address of the vertex buffer
+ GLuint64EXT vertex_buffer_address = 0;
+
/// Display information for Switch screen
ScreenInfo screen_info;
/// Global dummy shader pipeline
- GLShader::ProgramManager program_manager;
+ ProgramManager program_manager;
/// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data;
/// Used for transforming the framebuffer orientation
- Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
+ Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{};
Common::Rectangle<int> framebuffer_crop_rect;
-
- /// Frame presentation mailbox
- std::unique_ptr<FrameMailbox> frame_mailbox;
-
- bool has_debug_tool = false;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp
index b751086fa..6d7bb16b2 100644
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -14,68 +14,6 @@
namespace OpenGL {
-struct VertexArrayPushBuffer::Entry {
- GLuint binding_index{};
- const GLuint* buffer{};
- GLintptr offset{};
- GLsizei stride{};
-};
-
-VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker)
- : state_tracker{state_tracker} {}
-
-VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;
-
-void VertexArrayPushBuffer::Setup() {
- index_buffer = nullptr;
- vertex_buffers.clear();
-}
-
-void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) {
- index_buffer = buffer;
-}
-
-void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer,
- GLintptr offset, GLsizei stride) {
- vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride});
-}
-
-void VertexArrayPushBuffer::Bind() {
- if (index_buffer) {
- state_tracker.BindIndexBuffer(*index_buffer);
- }
-
- for (const auto& entry : vertex_buffers) {
- glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride);
- }
-}
-
-struct BindBuffersRangePushBuffer::Entry {
- GLuint binding;
- const GLuint* buffer;
- GLintptr offset;
- GLsizeiptr size;
-};
-
-BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}
-
-BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;
-
-void BindBuffersRangePushBuffer::Setup() {
- entries.clear();
-}
-
-void BindBuffersRangePushBuffer::Push(GLuint binding, const GLuint* buffer, GLintptr offset,
- GLsizeiptr size) {
- entries.push_back(Entry{binding, buffer, offset, size});
-}
-
-void BindBuffersRangePushBuffer::Bind() {
- for (const Entry& entry : entries) {
- glBindBufferRange(target, entry.binding, *entry.buffer, entry.offset, entry.size);
- }
-}
-
void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) {
if (!GLAD_GL_KHR_debug) {
// We don't need to throw an error as this is just for debugging
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
index 47ee3177b..9c09ee12c 100644
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -11,49 +11,6 @@
namespace OpenGL {
-class StateTracker;
-
-class VertexArrayPushBuffer final {
-public:
- explicit VertexArrayPushBuffer(StateTracker& state_tracker);
- ~VertexArrayPushBuffer();
-
- void Setup();
-
- void SetIndexBuffer(const GLuint* buffer);
-
- void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset,
- GLsizei stride);
-
- void Bind();
-
-private:
- struct Entry;
-
- StateTracker& state_tracker;
-
- const GLuint* index_buffer{};
- std::vector<Entry> vertex_buffers;
-};
-
-class BindBuffersRangePushBuffer final {
-public:
- explicit BindBuffersRangePushBuffer(GLenum target);
- ~BindBuffersRangePushBuffer();
-
- void Setup();
-
- void Push(GLuint binding, const GLuint* buffer, GLintptr offset, GLsizeiptr size);
-
- void Bind();
-
-private:
- struct Entry;
-
- GLenum target;
- std::vector<Entry> entries;
-};
-
void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {});
} // namespace OpenGL