diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
35 files changed, 4449 insertions, 1718 deletions
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp new file mode 100644 index 000000000..d6120c23e --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -0,0 +1,2126 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <string> +#include <string_view> +#include <utility> +#include <variant> + +#include <fmt/format.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/shader_ir.h" + +// Predicates in the decompiled code follow the convention that -1 means true and 0 means false. +// GLASM lacks booleans, so they have to be implemented as integers. +// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to +// select between two values, because -1 will be evaluated as true and 0 as false. + +namespace OpenGL { + +namespace { + +using Tegra::Engines::ShaderType; +using Tegra::Shader::Attribute; +using Tegra::Shader::PixelImap; +using Tegra::Shader::Register; +using namespace VideoCommon::Shader; +using Operation = const OperationNode&; + +constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; + +char Swizzle(std::size_t component) { + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); +} + +constexpr bool IsGenericAttribute(Attribute::Index index) { + return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; +} + +u32 GetGenericAttributeIndex(Attribute::Index index) { + ASSERT(IsGenericAttribute(index)); + return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); +} + +std::string_view Modifiers(Operation operation) { + const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta()); + if (meta && meta->precise) { + return ".PREC"; + } + return ""; +} + +std::string_view GetInputFlags(PixelImap attribute) { + switch (attribute) { + case PixelImap::Perspective: + return ""; + case PixelImap::Constant: + return "FLAT "; + case PixelImap::ScreenLinear: + return "NOPERSPECTIVE "; + case PixelImap::Unused: + break; + } + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + return {}; +} + +std::string_view ImageType(Tegra::Shader::ImageType image_type) { + switch (image_type) { + case Tegra::Shader::ImageType::Texture1D: + return "1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "BUFFER"; + case Tegra::Shader::ImageType::Texture1DArray: + return "ARRAY1D"; + case Tegra::Shader::ImageType::Texture2D: + return "2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "ARRAY2D"; + case Tegra::Shader::ImageType::Texture3D: + return "3D"; + } + UNREACHABLE(); + return {}; +} + +std::string_view StackName(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "SSY"; + case MetaStackClass::Pbk: + return "PBK"; + } + UNREACHABLE(); + return ""; +}; + +std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points: + return "POINTS"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return "LINES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + return "LINES_ADJACENCY"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + return "TRIANGLES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return "TRIANGLES_ADJACENCY"; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return "POINTS"; + } +} + +std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "POINTS"; + case Tegra::Shader::OutputTopology::LineStrip: + return "LINE_STRIP"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "TRIANGLE_STRIP"; + default: + UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + return "points"; + } +} + +std::string_view StageInputName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + case ShaderType::Geometry: + return "vertex"; + case ShaderType::Fragment: + return "fragment"; + case ShaderType::Compute: + return "invocation"; + default: + UNREACHABLE(); + return ""; + } +} + +std::string TextureType(const MetaTexture& meta) { + if (meta.sampler.is_buffer) { + return "BUFFER"; + } + std::string type; + if (meta.sampler.is_shadow) { + type += "SHADOW"; + } + if (meta.sampler.is_array) { + type += "ARRAY"; + } + type += [&meta] { + switch (meta.sampler.type) { + case Tegra::Shader::TextureType::Texture1D: + return "1D"; + case Tegra::Shader::TextureType::Texture2D: + return "2D"; + case Tegra::Shader::TextureType::Texture3D: + return "3D"; + case Tegra::Shader::TextureType::TextureCube: + return "CUBE"; + } + UNREACHABLE(); + return "2D"; + }(); + return type; +} + +class ARBDecompiler final { +public: + explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier); + + std::string Code() const { + return shader_source; + } + +private: + void DefineGlobalMemory(); + + void DeclareHeader(); + void DeclareVertex(); + void DeclareGeometry(); + void DeclareFragment(); + void DeclareCompute(); + void DeclareInputAttributes(); + void DeclareOutputAttributes(); + void DeclareLocalMemory(); + void DeclareGlobalMemory(); + void DeclareConstantBuffers(); + void DeclareRegisters(); + void DeclareTemporaries(); + void DeclarePredicates(); + void DeclareInternalFlags(); + + void InitializeVariables(); + + void DecompileAST(); + void DecompileBranchMode(); + + void VisitAST(const ASTNode& node); + std::string VisitExpression(const Expr& node); + + void VisitBlock(const NodeBlock& bb); + + std::string Visit(const Node& node); + + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); + std::string BuildAoffi(Operation); + std::string GlobalMemoryPointer(const GmemNode& gmem); + void Exit(); + + std::string Assign(Operation); + std::string Select(Operation); + std::string FClamp(Operation); + std::string FCastHalf0(Operation); + std::string FCastHalf1(Operation); + std::string FSqrt(Operation); + std::string FSwizzleAdd(Operation); + std::string HAdd2(Operation); + std::string HMul2(Operation); + std::string HFma2(Operation); + std::string HAbsolute(Operation); + std::string HNegate(Operation); + std::string HClamp(Operation); + std::string HCastFloat(Operation); + std::string HUnpack(Operation); + std::string HMergeF32(Operation); + std::string HMergeH0(Operation); + std::string HMergeH1(Operation); + std::string HPack2(Operation); + std::string LogicalAssign(Operation); + std::string LogicalPick2(Operation); + std::string LogicalAnd2(Operation); + std::string FloatOrdered(Operation); + std::string FloatUnordered(Operation); + std::string LogicalAddCarry(Operation); + std::string Texture(Operation); + std::string TextureGather(Operation); + std::string TextureQueryDimensions(Operation); + std::string TextureQueryLod(Operation); + std::string TexelFetch(Operation); + std::string TextureGradient(Operation); + std::string ImageLoad(Operation); + std::string ImageStore(Operation); + std::string Branch(Operation); + std::string BranchIndirect(Operation); + std::string PushFlowStack(Operation); + std::string PopFlowStack(Operation); + std::string Exit(Operation); + std::string Discard(Operation); + std::string EmitVertex(Operation); + std::string EndPrimitive(Operation); + std::string InvocationId(Operation); + std::string YNegate(Operation); + std::string ThreadId(Operation); + std::string ShuffleIndexed(Operation); + std::string Barrier(Operation); + std::string MemoryBarrierGroup(Operation); + std::string MemoryBarrierGlobal(Operation); + + template <const std::string_view& op> + std::string Unary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); + return temporary; + } + + template <const std::string_view& op> + std::string Binary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1])); + return temporary; + } + + template <const std::string_view& op> + std::string Trinary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1]), Visit(operation[2])); + return temporary; + } + + template <const std::string_view& op, bool unordered> + std::string FloatComparison(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NE.x), -1;", temporary); + + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + if constexpr (unordered) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + } else if (op == SNE_F) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + } + return temporary; + } + + template <const std::string_view& op, bool is_nan> + std::string HalfComparison(Operation operation) { + std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + AddLine("UP2H.F {}, {};", tmp1, op_a); + AddLine("UP2H.F {}, {};", tmp2, op_b); + AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); + AddLine("TRUNC.U.CC RC.xy, {};", tmp1); + AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); + AddLine("MOV.S {}.x (NE.x), -1;", tmp1); + AddLine("MOV.S {}.y (NE.y), -1;", tmp1); + if constexpr (is_nan) { + AddLine("MOVC.F RC.x, {};", op_a); + AddLine("MOV.S {}.x (NAN.x), -1;", tmp1); + AddLine("MOVC.F RC.x, {};", op_b); + AddLine("MOV.S {}.y (NAN.x), -1;", tmp1); + } + return tmp1; + } + + template <const std::string_view& op, const std::string_view& type> + std::string AtomicImage(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord, + image_id, ImageType(meta.image.type)); + return fmt::format("{}.x", coord); + } + + template <const std::string_view& op, const std::string_view& type> + std::string Atomic(Operation operation) { + std::string temporary = AllocTemporary(); + std::string address; + std::string_view opname; + bool robust = false; + if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + address = GlobalMemoryPointer(*gmem); + opname = "ATOM"; + robust = true; + } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); + opname = "ATOMS"; + } else { + UNREACHABLE(); + return "{0, 0, 0, 0}"; + } + if (robust) { + AddLine("IF NE.x;"); + } + AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } + return temporary; + } + + template <char type> + std::string Negate(Operation operation) { + std::string temporary = AllocTemporary(); + if constexpr (type == 'F') { + AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); + } else { + AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0])); + } + return temporary; + } + + template <char type> + std::string Absolute(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); + return temporary; + } + + template <char type> + std::string BitfieldInsert(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); + AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]), + Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char type> + std::string BitfieldExtract(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); + AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char swizzle> + std::string LocalInvocationId(Operation) { + return fmt::format("invocation.localid.{}", swizzle); + } + + template <char swizzle> + std::string WorkGroupId(Operation) { + return fmt::format("invocation.groupid.{}", swizzle); + } + + template <char c1, char c2> + std::string ThreadMask(Operation) { + return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2); + } + + template <typename... Args> + void AddExpression(std::string_view text, Args&&... args) { + shader_source += fmt::format(text, std::forward<Args>(args)...); + } + + template <typename... Args> + void AddLine(std::string_view text, Args&&... args) { + AddExpression(text, std::forward<Args>(args)...); + shader_source += '\n'; + } + + std::string AllocLongVectorTemporary() { + max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1); + return fmt::format("L{}", num_long_temporaries++); + } + + std::string AllocLongTemporary() { + return fmt::format("{}.x", AllocLongVectorTemporary()); + } + + std::string AllocVectorTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}", num_temporaries++); + } + + std::string AllocTemporary() { + return fmt::format("{}.x", AllocVectorTemporary()); + } + + void ResetTemporaries() noexcept { + num_temporaries = 0; + num_long_temporaries = 0; + } + + const Device& device; + const ShaderIR& ir; + const Registry& registry; + const ShaderType stage; + + std::size_t num_temporaries = 0; + std::size_t max_temporaries = 0; + + std::size_t num_long_temporaries = 0; + std::size_t max_long_temporaries = 0; + + std::map<GlobalMemoryBase, u32> global_memory_names; + + std::string shader_source; + + static constexpr std::string_view ADD_F32 = "ADD.F32"; + static constexpr std::string_view ADD_S = "ADD.S"; + static constexpr std::string_view ADD_U = "ADD.U"; + static constexpr std::string_view MUL_F32 = "MUL.F32"; + static constexpr std::string_view MUL_S = "MUL.S"; + static constexpr std::string_view MUL_U = "MUL.U"; + static constexpr std::string_view DIV_F32 = "DIV.F32"; + static constexpr std::string_view DIV_S = "DIV.S"; + static constexpr std::string_view DIV_U = "DIV.U"; + static constexpr std::string_view MAD_F32 = "MAD.F32"; + static constexpr std::string_view RSQ_F32 = "RSQ.F32"; + static constexpr std::string_view COS_F32 = "COS.F32"; + static constexpr std::string_view SIN_F32 = "SIN.F32"; + static constexpr std::string_view EX2_F32 = "EX2.F32"; + static constexpr std::string_view LG2_F32 = "LG2.F32"; + static constexpr std::string_view SLT_F = "SLT.F32"; + static constexpr std::string_view SLT_S = "SLT.S"; + static constexpr std::string_view SLT_U = "SLT.U"; + static constexpr std::string_view SEQ_F = "SEQ.F32"; + static constexpr std::string_view SEQ_S = "SEQ.S"; + static constexpr std::string_view SEQ_U = "SEQ.U"; + static constexpr std::string_view SLE_F = "SLE.F32"; + static constexpr std::string_view SLE_S = "SLE.S"; + static constexpr std::string_view SLE_U = "SLE.U"; + static constexpr std::string_view SGT_F = "SGT.F32"; + static constexpr std::string_view SGT_S = "SGT.S"; + static constexpr std::string_view SGT_U = "SGT.U"; + static constexpr std::string_view SNE_F = "SNE.F32"; + static constexpr std::string_view SNE_S = "SNE.S"; + static constexpr std::string_view SNE_U = "SNE.U"; + static constexpr std::string_view SGE_F = "SGE.F32"; + static constexpr std::string_view SGE_S = "SGE.S"; + static constexpr std::string_view SGE_U = "SGE.U"; + static constexpr std::string_view AND_S = "AND.S"; + static constexpr std::string_view AND_U = "AND.U"; + static constexpr std::string_view TRUNC_F = "TRUNC.F"; + static constexpr std::string_view TRUNC_S = "TRUNC.S"; + static constexpr std::string_view TRUNC_U = "TRUNC.U"; + static constexpr std::string_view SHL_S = "SHL.S"; + static constexpr std::string_view SHL_U = "SHL.U"; + static constexpr std::string_view SHR_S = "SHR.S"; + static constexpr std::string_view SHR_U = "SHR.U"; + static constexpr std::string_view OR_S = "OR.S"; + static constexpr std::string_view OR_U = "OR.U"; + static constexpr std::string_view XOR_S = "XOR.S"; + static constexpr std::string_view XOR_U = "XOR.U"; + static constexpr std::string_view NOT_S = "NOT.S"; + static constexpr std::string_view NOT_U = "NOT.U"; + static constexpr std::string_view BTC_S = "BTC.S"; + static constexpr std::string_view BTC_U = "BTC.U"; + static constexpr std::string_view BTFM_S = "BTFM.S"; + static constexpr std::string_view BTFM_U = "BTFM.U"; + static constexpr std::string_view ROUND_F = "ROUND.F"; + static constexpr std::string_view CEIL_F = "CEIL.F"; + static constexpr std::string_view FLR_F = "FLR.F"; + static constexpr std::string_view I2F_S = "I2F.S"; + static constexpr std::string_view I2F_U = "I2F.U"; + static constexpr std::string_view MIN_F = "MIN.F"; + static constexpr std::string_view MIN_S = "MIN.S"; + static constexpr std::string_view MIN_U = "MIN.U"; + static constexpr std::string_view MAX_F = "MAX.F"; + static constexpr std::string_view MAX_S = "MAX.S"; + static constexpr std::string_view MAX_U = "MAX.U"; + static constexpr std::string_view MOV_U = "MOV.U"; + static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U"; + static constexpr std::string_view TGALL_U = "TGALL.U"; + static constexpr std::string_view TGANY_U = "TGANY.U"; + static constexpr std::string_view TGEQ_U = "TGEQ.U"; + static constexpr std::string_view EXCH = "EXCH"; + static constexpr std::string_view ADD = "ADD"; + static constexpr std::string_view MIN = "MIN"; + static constexpr std::string_view MAX = "MAX"; + static constexpr std::string_view AND = "AND"; + static constexpr std::string_view OR = "OR"; + static constexpr std::string_view XOR = "XOR"; + static constexpr std::string_view U32 = "U32"; + static constexpr std::string_view S32 = "S32"; + + static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); + using DecompilerType = std::string (ARBDecompiler::*)(Operation); + static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { + &ARBDecompiler::Assign, + + &ARBDecompiler::Select, + + &ARBDecompiler::Binary<ADD_F32>, + &ARBDecompiler::Binary<MUL_F32>, + &ARBDecompiler::Binary<DIV_F32>, + &ARBDecompiler::Trinary<MAD_F32>, + &ARBDecompiler::Negate<'F'>, + &ARBDecompiler::Absolute<'F'>, + &ARBDecompiler::FClamp, + &ARBDecompiler::FCastHalf0, + &ARBDecompiler::FCastHalf1, + &ARBDecompiler::Binary<MIN_F>, + &ARBDecompiler::Binary<MAX_F>, + &ARBDecompiler::Unary<COS_F32>, + &ARBDecompiler::Unary<SIN_F32>, + &ARBDecompiler::Unary<EX2_F32>, + &ARBDecompiler::Unary<LG2_F32>, + &ARBDecompiler::Unary<RSQ_F32>, + &ARBDecompiler::FSqrt, + &ARBDecompiler::Unary<ROUND_F>, + &ARBDecompiler::Unary<FLR_F>, + &ARBDecompiler::Unary<CEIL_F>, + &ARBDecompiler::Unary<TRUNC_F>, + &ARBDecompiler::Unary<I2F_S>, + &ARBDecompiler::Unary<I2F_U>, + &ARBDecompiler::FSwizzleAdd, + + &ARBDecompiler::Binary<ADD_S>, + &ARBDecompiler::Binary<MUL_S>, + &ARBDecompiler::Binary<DIV_S>, + &ARBDecompiler::Negate<'S'>, + &ARBDecompiler::Absolute<'S'>, + &ARBDecompiler::Binary<MIN_S>, + &ARBDecompiler::Binary<MAX_S>, + + &ARBDecompiler::Unary<TRUNC_S>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_S>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_S>, + &ARBDecompiler::Binary<AND_S>, + &ARBDecompiler::Binary<OR_S>, + &ARBDecompiler::Binary<XOR_S>, + &ARBDecompiler::Unary<NOT_S>, + &ARBDecompiler::BitfieldInsert<'S'>, + &ARBDecompiler::BitfieldExtract<'S'>, + &ARBDecompiler::Unary<BTC_S>, + &ARBDecompiler::Unary<BTFM_S>, + + &ARBDecompiler::Binary<ADD_U>, + &ARBDecompiler::Binary<MUL_U>, + &ARBDecompiler::Binary<DIV_U>, + &ARBDecompiler::Binary<MIN_U>, + &ARBDecompiler::Binary<MAX_U>, + &ARBDecompiler::Unary<TRUNC_U>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::BitfieldInsert<'U'>, + &ARBDecompiler::BitfieldExtract<'U'>, + &ARBDecompiler::Unary<BTC_U>, + &ARBDecompiler::Unary<BTFM_U>, + + &ARBDecompiler::HAdd2, + &ARBDecompiler::HMul2, + &ARBDecompiler::HFma2, + &ARBDecompiler::HAbsolute, + &ARBDecompiler::HNegate, + &ARBDecompiler::HClamp, + &ARBDecompiler::HCastFloat, + &ARBDecompiler::HUnpack, + &ARBDecompiler::HMergeF32, + &ARBDecompiler::HMergeH0, + &ARBDecompiler::HMergeH1, + &ARBDecompiler::HPack2, + + &ARBDecompiler::LogicalAssign, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::LogicalPick2, + &ARBDecompiler::LogicalAnd2, + + &ARBDecompiler::FloatComparison<SLT_F, false>, + &ARBDecompiler::FloatComparison<SEQ_F, false>, + &ARBDecompiler::FloatComparison<SLE_F, false>, + &ARBDecompiler::FloatComparison<SGT_F, false>, + &ARBDecompiler::FloatComparison<SNE_F, false>, + &ARBDecompiler::FloatComparison<SGE_F, false>, + &ARBDecompiler::FloatOrdered, + &ARBDecompiler::FloatUnordered, + &ARBDecompiler::FloatComparison<SLT_F, true>, + &ARBDecompiler::FloatComparison<SEQ_F, true>, + &ARBDecompiler::FloatComparison<SLE_F, true>, + &ARBDecompiler::FloatComparison<SGT_F, true>, + &ARBDecompiler::FloatComparison<SNE_F, true>, + &ARBDecompiler::FloatComparison<SGE_F, true>, + + &ARBDecompiler::Binary<SLT_S>, + &ARBDecompiler::Binary<SEQ_S>, + &ARBDecompiler::Binary<SLE_S>, + &ARBDecompiler::Binary<SGT_S>, + &ARBDecompiler::Binary<SNE_S>, + &ARBDecompiler::Binary<SGE_S>, + + &ARBDecompiler::Binary<SLT_U>, + &ARBDecompiler::Binary<SEQ_U>, + &ARBDecompiler::Binary<SLE_U>, + &ARBDecompiler::Binary<SGT_U>, + &ARBDecompiler::Binary<SNE_U>, + &ARBDecompiler::Binary<SGE_U>, + + &ARBDecompiler::LogicalAddCarry, + + &ARBDecompiler::HalfComparison<SLT_F, false>, + &ARBDecompiler::HalfComparison<SEQ_F, false>, + &ARBDecompiler::HalfComparison<SLE_F, false>, + &ARBDecompiler::HalfComparison<SGT_F, false>, + &ARBDecompiler::HalfComparison<SNE_F, false>, + &ARBDecompiler::HalfComparison<SGE_F, false>, + &ARBDecompiler::HalfComparison<SLT_F, true>, + &ARBDecompiler::HalfComparison<SEQ_F, true>, + &ARBDecompiler::HalfComparison<SLE_F, true>, + &ARBDecompiler::HalfComparison<SGT_F, true>, + &ARBDecompiler::HalfComparison<SNE_F, true>, + &ARBDecompiler::HalfComparison<SGE_F, true>, + + &ARBDecompiler::Texture, + &ARBDecompiler::Texture, + &ARBDecompiler::TextureGather, + &ARBDecompiler::TextureQueryDimensions, + &ARBDecompiler::TextureQueryLod, + &ARBDecompiler::TexelFetch, + &ARBDecompiler::TextureGradient, + + &ARBDecompiler::ImageLoad, + &ARBDecompiler::ImageStore, + + &ARBDecompiler::AtomicImage<ADD, U32>, + &ARBDecompiler::AtomicImage<AND, U32>, + &ARBDecompiler::AtomicImage<OR, U32>, + &ARBDecompiler::AtomicImage<XOR, U32>, + &ARBDecompiler::AtomicImage<EXCH, U32>, + + &ARBDecompiler::Atomic<EXCH, U32>, + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<EXCH, S32>, + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Branch, + &ARBDecompiler::BranchIndirect, + &ARBDecompiler::PushFlowStack, + &ARBDecompiler::PopFlowStack, + &ARBDecompiler::Exit, + &ARBDecompiler::Discard, + + &ARBDecompiler::EmitVertex, + &ARBDecompiler::EndPrimitive, + + &ARBDecompiler::InvocationId, + &ARBDecompiler::YNegate, + &ARBDecompiler::LocalInvocationId<'x'>, + &ARBDecompiler::LocalInvocationId<'y'>, + &ARBDecompiler::LocalInvocationId<'z'>, + &ARBDecompiler::WorkGroupId<'x'>, + &ARBDecompiler::WorkGroupId<'y'>, + &ARBDecompiler::WorkGroupId<'z'>, + + &ARBDecompiler::Unary<TGBALLOT_U>, + &ARBDecompiler::Unary<TGALL_U>, + &ARBDecompiler::Unary<TGANY_U>, + &ARBDecompiler::Unary<TGEQ_U>, + + &ARBDecompiler::ThreadId, + &ARBDecompiler::ThreadMask<'e', 'q'>, + &ARBDecompiler::ThreadMask<'g', 'e'>, + &ARBDecompiler::ThreadMask<'g', 't'>, + &ARBDecompiler::ThreadMask<'l', 'e'>, + &ARBDecompiler::ThreadMask<'l', 't'>, + &ARBDecompiler::ShuffleIndexed, + + &ARBDecompiler::Barrier, + &ARBDecompiler::MemoryBarrierGroup, + &ARBDecompiler::MemoryBarrierGlobal, + }; +}; + +ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier) + : device{device}, ir{ir}, registry{registry}, stage{stage} { + DefineGlobalMemory(); + + AddLine("TEMP RC;"); + AddLine("TEMP FSWZA[4];"); + AddLine("TEMP FSWZB[4];"); + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + AddLine("END"); + + const std::string code = std::move(shader_source); + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareLocalMemory(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareRegisters(); + DeclareTemporaries(); + DeclarePredicates(); + DeclareInternalFlags(); + + shader_source += code; +} + +std::string_view HeaderStageName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + return "vp"; + case ShaderType::Geometry: + return "gp"; + case ShaderType::Fragment: + return "fp"; + case ShaderType::Compute: + return "cp"; + default: + UNREACHABLE(); + return ""; + } +} + +void ARBDecompiler::DefineGlobalMemory() { + u32 binding = 0; + for (const auto& pair : ir.GetGlobalMemory()) { + const GlobalMemoryBase base = pair.first; + global_memory_names.emplace(base, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareHeader() { + AddLine("!!NV{}5.0", HeaderStageName(stage)); + // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D + AddLine("OPTION NV_internal;"); + AddLine("OPTION NV_gpu_program_fp64;"); + AddLine("OPTION NV_shader_thread_group;"); + if (ir.UsesWarps() && device.HasWarpIntrinsics()) { + AddLine("OPTION NV_shader_thread_shuffle;"); + } + if (stage == ShaderType::Vertex) { + if (device.HasNvViewportArray2()) { + AddLine("OPTION NV_viewport_array2;"); + } + } + if (stage == ShaderType::Fragment) { + AddLine("OPTION ARB_draw_buffers;"); + } + if (device.HasImageLoadFormatted()) { + AddLine("OPTION EXT_shader_image_load_formatted;"); + } +} + +void ARBDecompiler::DeclareVertex() { + if (stage != ShaderType::Vertex) { + return; + } + AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};"); +} + +void ARBDecompiler::DeclareGeometry() { + if (stage != ShaderType::Geometry) { + return; + } + const auto& info = registry.GetGraphicsInfo(); + const auto& header = ir.GetHeader(); + AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology)); + AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology)); + AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value()); + AddLine("ATTRIB vertex_position = vertex.position;"); +} + +void ARBDecompiler::DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + AddLine("OUTPUT result_color7 = result.color[7];"); + AddLine("OUTPUT result_color6 = result.color[6];"); + AddLine("OUTPUT result_color5 = result.color[5];"); + AddLine("OUTPUT result_color4 = result.color[4];"); + AddLine("OUTPUT result_color3 = result.color[3];"); + AddLine("OUTPUT result_color2 = result.color[2];"); + AddLine("OUTPUT result_color1 = result.color[1];"); + AddLine("OUTPUT result_color0 = result.color;"); +} + +void ARBDecompiler::DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const ComputeInfo& info = registry.GetComputeInfo(); + AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1], + info.workgroup_size[2]); + if (info.shared_memory_size_in_words == 0) { + return; + } + const u32 limit = device.GetMaxComputeSharedMemorySize(); + u32 size_in_bytes = info.shared_memory_size_in_words * 4; + if (size_in_bytes > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size_in_bytes, limit); + size_in_bytes = limit; + } + + AddLine("SHARED_MEMORY {};", size_in_bytes); + AddLine("SHARED shared_mem[] = {{program.sharedmem}};"); +} + +void ARBDecompiler::DeclareInputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + const std::string_view stage_name = StageInputName(stage); + for (const auto attribute : ir.GetInputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + + std::string_view suffix; + if (stage == ShaderType::Fragment) { + const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)}; + if (input_mode == PixelImap::Unused) { + return; + } + suffix = GetInputFlags(input_mode); + } + AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index, + index); + } +} + +void ARBDecompiler::DeclareOutputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index); + } +} + +void ARBDecompiler::DeclareLocalMemory() { + u64 size = 0; + if (stage == ShaderType::Compute) { + size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + size = ir.GetHeader().GetLocalMemorySize(); + } + if (size == 0) { + return; + } + const u64 element_count = Common::AlignUp(size, 4) / 4; + AddLine("TEMP lmem[{}];", element_count); +} + +void ARBDecompiler::DeclareGlobalMemory() { + const size_t num_entries = ir.GetGlobalMemory().size(); + if (num_entries > 0) { + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); + } +} + +void ARBDecompiler::DeclareConstantBuffers() { + u32 binding = 0; + for (const auto& cbuf : ir.GetConstantBuffers()) { + AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareRegisters() { + for (const u32 gpr : ir.GetRegisters()) { + AddLine("TEMP R{};", gpr); + } +} + +void ARBDecompiler::DeclareTemporaries() { + for (std::size_t i = 0; i < max_temporaries; ++i) { + AddLine("TEMP T{};", i); + } + for (std::size_t i = 0; i < max_long_temporaries; ++i) { + AddLine("LONG TEMP L{};", i); + } +} + +void ARBDecompiler::DeclarePredicates() { + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("TEMP P{};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DeclareInternalFlags() { + for (const char* name : INTERNAL_FLAG_NAMES) { + AddLine("TEMP {};", name); + } +} + +void ARBDecompiler::InitializeVariables() { + AddLine("MOV.F32 FSWZA[0], -1;"); + AddLine("MOV.F32 FSWZA[1], 1;"); + AddLine("MOV.F32 FSWZA[2], -1;"); + AddLine("MOV.F32 FSWZA[3], 0;"); + AddLine("MOV.F32 FSWZB[0], -1;"); + AddLine("MOV.F32 FSWZB[1], -1;"); + AddLine("MOV.F32 FSWZB[2], 1;"); + AddLine("MOV.F32 FSWZB[3], -1;"); + + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { + AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index); + } + for (const u32 gpr : ir.GetRegisters()) { + AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr); + } + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DecompileAST() { + const u32 num_flow_variables = ir.GetASTNumVariables(); + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("TEMP F{};", i); + } + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i); + } + + InitializeVariables(); + + VisitAST(ir.GetASTProgram()); +} + +void ARBDecompiler::DecompileBranchMode() { + static constexpr u32 FLOW_STACK_SIZE = 20; + if (!ir.IsFlowStackDisabled()) { + AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE); + AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE); + AddLine("TEMP SSY_TOP;"); + AddLine("TEMP PBK_TOP;"); + } + + AddLine("TEMP PC;"); + + if (!ir.IsFlowStackDisabled()) { + AddLine("MOV.U SSY_TOP.x, 0;"); + AddLine("MOV.U PBK_TOP.x, 0;"); + } + + InitializeVariables(); + + const auto basic_block_end = ir.GetBasicBlocks().end(); + auto basic_block_it = ir.GetBasicBlocks().begin(); + const u32 first_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", first_address); + + AddLine("REP;"); + + std::size_t num_blocks = 0; + while (basic_block_it != basic_block_end) { + const auto& [address, bb] = *basic_block_it; + ++num_blocks; + + AddLine("SEQ.S.CC RC.x, PC.x, {};", address); + AddLine("IF NE.x;"); + + VisitBlock(bb); + + ++basic_block_it; + + if (basic_block_it != basic_block_end) { + const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]); + if (!op || op->GetCode() != OperationCode::Branch) { + const u32 next_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", next_address); + AddLine("CONT;"); + } + } + + AddLine("ELSE;"); + } + AddLine("RET;"); + while (num_blocks--) { + AddLine("ENDIF;"); + } + + AddLine("ENDREP;"); +} + +void ARBDecompiler::VisitAST(const ASTNode& node) { + if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) { + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + + AddLine("MOVC.U RC.x, {};", condition); + AddLine("IF NE.x;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("ENDIF;"); + } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + AddLine("ELSE;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(ast->nodes); + } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + ResetTemporaries(); + } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + AddLine("REP;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("MOVC.U RC.x, {};", condition); + AddLine("BRK (NE.x);"); + AddLine("ENDREP;"); + } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast->condition); + if (!is_true) { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("IF NE.x;"); + ResetTemporaries(); + } + if (ast->kills) { + AddLine("KIL TR;"); + } else { + Exit(); + } + if (!is_true) { + AddLine("ENDIF;"); + } + } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast->condition)) { + AddLine("BRK;"); + } else { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("BRK (NE.x);"); + ResetTemporaries(); + } + } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) { + // Nothing to do + } else { + UNREACHABLE(); + } +} + +std::string ARBDecompiler::VisitExpression(const Expr& node) { + if (const auto expr = std::get_if<ExprAnd>(&*node)) { + std::string result = AllocTemporary(); + AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprOr>(&*node)) { + std::string result = AllocTemporary(); + AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprNot>(&*node)) { + std::string result = AllocTemporary(); + AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); + return result; + } + if (const auto expr = std::get_if<ExprPredicate>(&*node)) { + return fmt::format("P{}.x", static_cast<u64>(expr->predicate)); + } + if (const auto expr = std::get_if<ExprCondCode>(&*node)) { + return Visit(ir.GetConditionCode(expr->cc)); + } + if (const auto expr = std::get_if<ExprVar>(&*node)) { + return fmt::format("F{}.x", expr->var_index); + } + if (const auto expr = std::get_if<ExprBoolean>(&*node)) { + return expr->value ? "0xffffffff" : "0"; + } + if (const auto expr = std::get_if<ExprGprEqual>(&*node)) { + std::string result = AllocTemporary(); + AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); + return result; + } + UNREACHABLE(); + return "0"; +} + +void ARBDecompiler::VisitBlock(const NodeBlock& bb) { + for (const auto& node : bb) { + Visit(node); + } +} + +std::string ARBDecompiler::Visit(const Node& node) { + if (const auto operation = std::get_if<OperationNode>(&*node)) { + if (const auto amend_index = operation->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + const std::size_t index = static_cast<std::size_t>(operation->GetCode()); + if (index >= OPERATION_DECOMPILERS.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", index); + return {}; + } + const auto decompiler = OPERATION_DECOMPILERS[index]; + if (decompiler == nullptr) { + UNREACHABLE_MSG("Undefined operation: {}", index); + return {}; + } + return (this->*decompiler)(*operation); + } + + if (const auto gpr = std::get_if<GprNode>(&*node)) { + const u32 index = gpr->GetIndex(); + if (index == Register::ZeroIndex) { + return "{0, 0, 0, 0}.x"; + } + return fmt::format("R{}.x", index); + } + + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + return fmt::format("CV{}.x", cv->GetIndex()); + } + + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); + return temporary; + } + + if (const auto predicate = std::get_if<PredicateNode>(&*node)) { + std::string temporary = AllocTemporary(); + switch (const auto index = predicate->GetIndex(); index) { + case Tegra::Shader::Pred::UnusedIndex: + AddLine("MOV.S {}, -1;", temporary); + break; + case Tegra::Shader::Pred::NeverExecute: + AddLine("MOV.S {}, 0;", temporary); + break; + default: + AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index)); + break; + } + if (predicate->IsNegated()) { + AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary); + } + return temporary; + } + + if (const auto abuf = std::get_if<AbufNode>(&*node)) { + if (abuf->IsPhysicalBuffer()) { + UNIMPLEMENTED_MSG("Physical buffers are not implemented"); + return "{0, 0, 0, 0}.x"; + } + + const Attribute::Index index = abuf->GetIndex(); + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (index) { + case Attribute::Index::Position: { + if (stage == ShaderType::Geometry) { + return fmt::format("{}_position[{}].{}", StageInputName(stage), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.position.{}", StageInputName(stage), swizzle); + } + } + case Attribute::Index::TessCoordInstanceIDVertexID: + ASSERT(stage == ShaderType::Vertex); + switch (element) { + case 2: + return "vertex.instance"; + case 3: + return "vertex.id"; + } + UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); + break; + case Attribute::Index::PointCoord: + switch (element) { + case 0: + return "fragment.pointcoord.x"; + case 1: + return "fragment.pointcoord.y"; + } + UNIMPLEMENTED(); + break; + case Attribute::Index::FrontFacing: { + ASSERT(stage == ShaderType::Fragment); + ASSERT(element == 3); + const std::string temporary = AllocVectorTemporary(); + AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};"); + AddLine("MOV.U.CC RC.x, -RC;"); + AddLine("MOV.S {}.x, 0;", temporary); + AddLine("MOV.S {}.x (NE.x), -1;", temporary); + return fmt::format("{}.x", temporary); + } + default: + if (IsGenericAttribute(index)) { + if (stage == ShaderType::Geometry) { + return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.attrib[{}].{}", StageInputName(stage), + GetGenericAttributeIndex(index), swizzle); + } + } + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + break; + } + return "{0, 0, 0, 0}.x"; + } + + if (const auto cbuf = std::get_if<CbufNode>(&*node)) { + std::string offset_string; + const auto& offset = cbuf->GetOffset(); + if (const auto imm = std::get_if<ImmediateNode>(&*offset)) { + offset_string = std::to_string(imm->GetValue()); + } else { + offset_string = Visit(offset); + } + std::string temporary = AllocTemporary(); + AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); + return temporary; + } + + if (const auto gmem = std::get_if<GmemNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); + return temporary; + } + + if (const auto lmem = std::get_if<LmemNode>(&*node)) { + std::string temporary = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", temporary, temporary); + AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); + return temporary; + } + + if (const auto smem = std::get_if<SmemNode>(&*node)) { + std::string temporary = Visit(smem->GetAddress()); + AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); + return temporary; + } + + if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } + + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + if (const auto amend_index = conditional->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition())); + AddLine("IF NE.x;"); + VisitBlock(conditional->GetCode()); + AddLine("ENDIF;"); + return {}; + } + + if ([[maybe_unused]] const auto cmt = std::get_if<CommentNode>(&*node)) { + // Uncommenting this will generate invalid code. GLASM lacks comments. + // AddLine("// {}", cmt->GetText()); + return {}; + } + + UNIMPLEMENTED(); + return {}; +} + +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + UNIMPLEMENTED_IF(meta.sampler.is_indexed); + + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; + const std::size_t count = operation.GetOperandsCount(); + std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + if (meta.sampler.is_array) { + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; + } + if (meta.sampler.is_shadow) { + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; + } + return {temporary, temporary, i}; +} + +std::string ARBDecompiler::BuildAoffi(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + if (meta.aoffi.empty()) { + return {}; + } + const std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (auto& node : meta.aoffi) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node)); + } + return fmt::format(", offset({})", temporary); +} + +std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // address = c[binding].xy + // length = c[binding].z + const u32 binding = global_memory_names.at(gmem.GetDescriptor()); + + const std::string pointer = AllocLongVectorTemporary(); + std::string temporary = AllocTemporary(); + + AddLine("PK64.U {}, c[{}];", pointer, binding); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), + Visit(gmem.GetBaseAddress())); + AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); + return fmt::format("{}.x", pointer); +} + +void ARBDecompiler::Exit() { + if (stage != ShaderType::Fragment) { + AddLine("RET;"); + return; + } + + const auto safe_get_register = [this](u32 reg) -> std::string { + // TODO(Rodrigo): Replace with contains once C++20 releases + const auto& used_registers = ir.GetRegisters(); + if (used_registers.find(reg) != used_registers.end()) { + return fmt::format("R{}.x", reg); + } + return "{0, 0, 0, 0}.x"; + }; + + const auto& header = ir.GetHeader(); + u32 current_reg = 0; + for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) { + for (u32 component = 0; component < 4; ++component) { + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; + } + AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component), + safe_get_register(current_reg)); + ++current_reg; + } + } + if (header.ps.omap.depth) { + AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1)); + } + + AddLine("RET;"); +} + +std::string ARBDecompiler::Assign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string dest_name; + if (const auto gpr = std::get_if<GprNode>(&*dest)) { + if (gpr->GetIndex() == Register::ZeroIndex) { + // Writing to Register::ZeroIndex is a no op + return {}; + } + dest_name = fmt::format("R{}.x", gpr->GetIndex()); + } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (const Attribute::Index index = abuf->GetIndex()) { + case Attribute::Index::Position: + dest_name = fmt::format("result.position.{}", swizzle); + break; + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + case 2: + if (!device.HasNvViewportArray2()) { + LOG_ERROR( + Render_OpenGL, + "NV_viewport_array2 is missing. Maxwell gen 2 or better is required."); + return {}; + } + dest_name = element == 1 ? "result.layer.x" : "result.viewport.x"; + break; + case 3: + dest_name = "result.pointsize.x"; + break; + } + break; + case Attribute::Index::ClipDistances0123: + dest_name = fmt::format("result.clip[{}].x", element); + break; + case Attribute::Index::ClipDistances4567: + dest_name = fmt::format("result.clip[{}].x", element + 4); + break; + default: + if (!IsGenericAttribute(index)) { + UNREACHABLE(); + return {}; + } + dest_name = + fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle); + break; + } + } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + const std::string address = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", address, address); + dest_name = fmt::format("lmem[{}].x", address); + } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { + AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress())); + ResetTemporaries(); + return {}; + } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); + AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); + ResetTemporaries(); + return {}; + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", dest_name, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::Select(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), + Visit(operation[2])); + return temporary; +} + +std::string ARBDecompiler::FClamp(Operation operation) { + // 1.0f in hex, replace with std::bit_cast on C++20 + static constexpr u32 POSITIVE_ONE = 0x3f800000; + + std::string temporary = AllocTemporary(); + const Node& value = operation[0]; + const Node& low = operation[1]; + const Node& high = operation[2]; + const auto* const imm_low = std::get_if<ImmediateNode>(&*low); + const auto* const imm_high = std::get_if<ImmediateNode>(&*high); + if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { + AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); + } else { + AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high)); + AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low)); + } + return temporary; +} + +std::string ARBDecompiler::FCastHalf0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FCastHalf1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0])); + AddLine("MOV {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FSqrt(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); + AddLine("RCP.F32 {}, {};", temporary, temporary); + return temporary; +} + +std::string ARBDecompiler::FSwizzleAdd(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return fmt::format("{}.x", temporary); + } + + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); + AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); + AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); + AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary); + AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary); + AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary); + AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HAdd2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HMul2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HFma2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string tmp3 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2])); + AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HAbsolute(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, |{}|;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HNegate(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("MOVC.S RC.x, {};", Visit(operation[1])); + AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary); + AddLine("MOVC.S RC.x, {};", Visit(operation[2])); + AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HClamp(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HCastFloat(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary); + AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HUnpack(Operation operation) { + std::string operand = Visit(operation[0]); + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H0_H0: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H1_H1: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + } + UNREACHABLE(); + return "{0, 0, 0, 0}.x"; +} + +std::string ARBDecompiler::HMergeF32(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.x, {}.z;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.w;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HPack2(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0])); + AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::LogicalAssign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string target; + + if (const auto pred = std::get_if<PredicateNode>(&*dest)) { + ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment"); + + const Tegra::Shader::Pred index = pred->GetIndex(); + switch (index) { + case Tegra::Shader::Pred::NeverExecute: + case Tegra::Shader::Pred::UnusedIndex: + // Writing to these predicates is a no-op + return {}; + } + target = fmt::format("P{}.x", static_cast<u64>(index)); + } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", target, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::LogicalPick2(Operation operation) { + std::string temporary = AllocTemporary(); + const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue(); + AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); + return temporary; +} + +std::string ARBDecompiler::LogicalAnd2(Operation operation) { + std::string temporary = AllocTemporary(); + const std::string op = Visit(operation[0]); + AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); + return temporary; +} + +std::string ARBDecompiler::FloatOrdered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, -1;", temporary); + AddLine("MOV.S {} (NAN.x), 0;", temporary); + AddLine("MOV.S {} (NAN.y), 0;", temporary); + return temporary; +} + +std::string ARBDecompiler::FloatUnordered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NAN.x), -1;", temporary); + AddLine("MOV.S {} (NAN.y), -1;", temporary); + return temporary; +} + +std::string ARBDecompiler::LogicalAddCarry(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("IF CF.x;"); + AddLine("MOV.S {}, -1;", temporary); + AddLine("ENDIF;"); + return temporary; +} + +std::string ARBDecompiler::Texture(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string_view opcode = "TEX"; + std::string extra; + if (meta.bias) { + ASSERT(!meta.lod); + opcode = "TXB"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias)); + } else { + const std::string bias = AllocTemporary(); + AddLine("MOV.F {}, {};", bias, Visit(meta.bias)); + extra = fmt::format(" {},", bias); + } + } + if (meta.lod) { + ASSERT(!meta.bias); + opcode = "TXL"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } else { + const std::string lod = AllocTemporary(); + AddLine("MOV.F {}, {};", lod, Visit(meta.lod)); + extra = fmt::format(" {},", lod); + } + } + + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGather(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string comp; + if (!meta.sampler.is_shadow) { + const auto& immediate = std::get<ImmediateNode>(*meta.component); + comp = fmt::format(".{}", Swizzle(immediate.GetValue())); + } + + AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryDimensions(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0"; + AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryLod(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::size_t count = operation.GetOperandsCount(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta)); + AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary); + AddLine("TRUNC.S {}, {};", temporary, temporary); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TexelFetch(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + if (!meta.sampler.is_buffer) { + ASSERT(swizzle < 4); + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), + BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGradient(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const std::string ddx = AllocVectorTemporary(); + const std::string ddy = AllocVectorTemporary(); + const std::string coord = std::get<1>(BuildCoords(operation)); + + const std::size_t num_components = meta.derivates.size() / 2; + for (std::size_t index = 0; index < num_components; ++index) { + const char swizzle = Swizzle(index); + AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2])); + AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1])); + } + + const std::string_view result = coord; + AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element)); + return fmt::format("{}.x", result); +} + +std::string ARBDecompiler::ImageLoad(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t count = operation.GetOperandsCount(); + const std::string_view type = ImageType(meta.image.type); + + const std::string temporary = AllocVectorTemporary(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type); + AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::ImageStore(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + const std::string_view type = ImageType(meta.image.type); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type); + return {}; +} + +std::string ARBDecompiler::Branch(Operation operation) { + const auto target = std::get<ImmediateNode>(*operation[0]); + AddLine("MOV.U PC.x, {};", target.GetValue()); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::BranchIndirect(Operation operation) { + AddLine("MOV.U PC.x, {};", Visit(operation[0])); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::PushFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue(); + const std::string_view stack_name = StackName(stack); + AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target); + AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + return {}; +} + +std::string ARBDecompiler::PopFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const std::string_view stack_name = StackName(stack); + AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::Exit(Operation) { + Exit(); + return {}; +} + +std::string ARBDecompiler::Discard(Operation) { + AddLine("KIL TR;"); + return {}; +} + +std::string ARBDecompiler::EmitVertex(Operation) { + AddLine("EMIT;"); + return {}; +} + +std::string ARBDecompiler::EndPrimitive(Operation) { + AddLine("ENDPRIM;"); + return {}; +} + +std::string ARBDecompiler::InvocationId(Operation) { + return "primitive.invocation"; +} + +std::string ARBDecompiler::YNegate(Operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + std::string temporary = AllocTemporary(); + AddLine("MOV.F {}, 1;", temporary); + return temporary; +} + +std::string ARBDecompiler::ThreadId(Operation) { + return fmt::format("{}.threadid", StageInputName(stage)); +} + +std::string ARBDecompiler::ShuffleIndexed(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + return Visit(operation[0]); + } + const std::string temporary = AllocVectorTemporary(); + AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]), + Visit(operation[1])); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::Barrier(Operation) { + AddLine("BAR;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGroup(Operation) { + AddLine("MEMBAR.CTA;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { + AddLine("MEMBAR;"); + return {}; +} + +} // Anonymous namespace + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier) { + return ARBDecompiler(device, ir, registry, stage, identifier).Code(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h new file mode 100644 index 000000000..6afc87220 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> +#include <string_view> + +#include "common/common_types.h" + +namespace Tegra::Engines { +enum class ShaderType : u32; +} + +namespace VideoCommon::Shader { +class ShaderIR; +class Registry; +} // namespace VideoCommon::Shader + +namespace OpenGL { + +class Device; + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 4eb37a96c..b1c4cd62f 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" @@ -21,22 +22,54 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; + +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { + glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), + data); +} -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + const GLintptr gl_offset = static_cast<GLintptr>(offset); + if (read_buffer.handle == 0) { + read_buffer.Create(); + glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, + GL_STREAM_READ); + } + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); + glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { + glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +} + +OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const Device& device_, std::size_t stream_size) + : GenericBufferCache{rasterizer, gpu_memory, cpu_memory, + std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, + device{device_} { if (!device.HasFastBufferSubData()) { return; } - static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); + static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); for (const GLuint cbuf : cbufs) { glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -47,49 +80,21 @@ OGLBufferCache::~OGLBufferCache() { glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(cpu_addr, size); -} - -void OGLBufferCache::WriteBarrier() { - glMemoryBarrier(GL_ALL_BARRIER_BITS); +std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, cpu_addr, size); } -const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { - static const GLuint null_buffer = 0; - return &null_buffer; -} - -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { - glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(), - static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset), - static_cast<GLsizeiptr>(size)); +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { + return {0, 0, 0}; } OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint& cbuf = cbufs[cbuf_cursor++]; + const GLuint cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {&cbuf, 0}; + return {cbuf, 0, 0}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index d94a11252..f75b32e31 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -24,59 +23,59 @@ class Device; class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferBlock; +class Buffer : public VideoCommon::BufferBlock { +public: + explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); + ~Buffer(); -using Buffer = std::shared_ptr<CachedBufferBlock>; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; + void Upload(std::size_t offset, std::size_t size, const u8* data); -class CachedBufferBlock : public VideoCommon::BufferBlock { -public: - explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); - ~CachedBufferBlock(); + void Download(std::size_t offset, std::size_t size, u8* data); + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size); - const GLuint* GetHandle() const { - return &gl_buffer.handle; + GLuint Handle() const noexcept { + return gl_buffer.handle; + } + + u64 Address() const noexcept { + return gpu_address; } private: - OGLBuffer gl_buffer{}; + OGLBuffer gl_buffer; + OGLBuffer read_buffer; + u64 gpu_address = 0; }; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class OGLBufferCache final : public GenericBufferCache { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const Device& device, std::size_t stream_size); ~OGLBufferCache(); - const GLuint* GetEmptyBuffer(std::size_t) override; + BufferInfo GetEmptyBuffer(std::size_t) override; void Acquire() noexcept { cbuf_cursor = 0; } protected: - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - void WriteBarrier() override; - - const GLuint* ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: + static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + + const Device& device; + std::size_t cbuf_cursor = 0; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> - cbufs; + std::array<GLuint, NUM_CBUFS> cbufs{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index c286502ba..a94e4f72e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -6,6 +6,7 @@ #include <array> #include <cstddef> #include <cstring> +#include <limits> #include <optional> #include <vector> @@ -13,6 +14,7 @@ #include "common/logging/log.h" #include "common/scope_exit.h" +#include "core/settings.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, - GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, - GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; +constexpr std::array LimitUBOs = { + GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, + GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; constexpr std::array LimitSSBOs = { - GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, + GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; -constexpr std::array LimitSamplers = { - GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS}; +constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; -constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, - GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, - GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, - GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; +constexpr std::array LimitImages = { + GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, + GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; template <typename T> T GetInteger(GLenum pname) { @@ -84,10 +89,17 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { return std::exchange(base, base + amount); } +std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { + std::array<u32, Tegra::Engines::MaxShaderTypes> max; + std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); + return max; +} + std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; - static std::array<std::size_t, 5> stage_swizzle = {0, 1, 2, 3, 4}; + static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4}; const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS); const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS); const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS); @@ -111,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); u32 base_images = 0; - // Reserve more image bindings on fragment and vertex stages. + // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8. + // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the + // fragment stage, and at least 1 for the rest of the stages. + // So far games are observed to use 1 image binding on vertex and 4 on fragment stages. + + // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]); - bindings[0].image = - Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + + // This is guaranteed to be at least 1. + const u32 total_extracted_images = num_images / (NumStages - 1); // Reserve the other image bindings. - const u32 total_extracted_images = num_images / (NumStages - 2); - for (std::size_t i = 2; i < NumStages; ++i) { + for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; + if (stage == 4) { + continue; + } bindings[stage].image = Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); } @@ -132,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin } bool IsASTCSupported() { + static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; static constexpr std::array formats = { GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, @@ -148,59 +169,94 @@ bool IsASTCSupported() { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, }; - return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { - GLint supported; - glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, - &supported); - return supported == GL_TRUE; - }) == formats.end(); + static constexpr std::array required_support = { + GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE, + GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE, + }; + + for (const GLenum target : targets) { + for (const GLenum format : formats) { + for (const GLenum support : required_support) { + GLint value; + glGetInternalformativ(target, format, support, 1, &value); + if (value != GL_FULL_SUPPORT) { + return false; + } + } + } + } + return true; } } // Anonymous namespace -Device::Device() : base_bindings{BuildBaseBindings()} { +Device::Device() + : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); + const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; - const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; + + bool disable_fast_buffer_sub_data = false; + if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { + LOG_WARNING( + Render_OpenGL, + "Beta driver 443.24 is known to have issues. There might be performance issues."); + disable_fast_buffer_sub_data = true; + } uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE); has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && GLAD_GL_NV_shader_thread_shuffle; has_shader_ballot = GLAD_GL_ARB_shader_ballot; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); + has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); has_astc = IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_broken_compute = is_intel_proprietary; - has_fast_buffer_sub_data = is_nvidia; + has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; + has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + + // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive + // uniform buffers as "push constants" + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + + use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() && + GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && + GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; + + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + + if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) { + LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); + } } Device::Device(std::nullptr_t) { - uniform_buffer_alignment = 0; + max_uniform_buffers.fill(std::numeric_limits<u32>::max()); + uniform_buffer_alignment = 4; + shader_storage_alignment = 4; max_vertex_attributes = 16; max_varyings = 15; + max_compute_shared_memory_size = 0x10000; has_warp_intrinsics = true; has_shader_ballot = true; has_vertex_viewport_layer = true; has_image_load_formatted = true; + has_texture_shadow_lod = true; has_variable_aoffi = true; - has_component_indexing_bug = false; - has_broken_compute = false; - has_precise_bug = false; } bool Device::TestVariableAoffi() { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index a55050cb5..8a4b6b9fc 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -24,6 +24,10 @@ public: explicit Device(); explicit Device(std::nullptr_t); + u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { + return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; + } + const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { return base_bindings[stage_index]; } @@ -48,6 +52,10 @@ public: return max_varyings; } + u32 GetMaxComputeSharedMemorySize() const { + return max_compute_shared_memory_size; + } + bool HasWarpIntrinsics() const { return has_warp_intrinsics; } @@ -64,6 +72,14 @@ public: return has_image_load_formatted; } + bool HasTextureShadowLod() const { + return has_texture_shadow_lod; + } + + bool HasVertexBufferUnifiedMemory() const { + return has_vertex_buffer_unified_memory; + } + bool HasASTC() const { return has_astc; } @@ -80,33 +96,47 @@ public: return has_precise_bug; } - bool HasBrokenCompute() const { - return has_broken_compute; - } - bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } + bool HasNvViewportArray2() const { + return has_nv_viewport_array2; + } + + bool UseAssemblyShaders() const { + return use_assembly_shaders; + } + + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); - std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; + std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; + std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; std::size_t uniform_buffer_alignment{}; std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + u32 max_compute_shared_memory_size{}; bool has_warp_intrinsics{}; bool has_shader_ballot{}; bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; + bool has_texture_shadow_lod{}; + bool has_vertex_buffer_unified_memory{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; - bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; + bool has_nv_viewport_array2{}; + bool use_assembly_shaders{}; + bool use_asynchronous_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp new file mode 100644 index 000000000..b532fdcc2 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -0,0 +1,73 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" + +namespace OpenGL { + +GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {} + +GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed) + : FenceBase(address, payload, is_stubbed) {} + +GLInnerFence::~GLInnerFence() = default; + +void GLInnerFence::Queue() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle == 0); + sync_object.Create(); +} + +bool GLInnerFence::IsSignaled() const { + if (is_stubbed) { + return true; + } + ASSERT(sync_object.handle != 0); + GLsizei length; + GLint sync_status; + glGetSynciv(sync_object.handle, GL_SYNC_STATUS, sizeof(GLint), &length, &sync_status); + return sync_status == GL_SIGNALED; +} + +void GLInnerFence::Wait() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle != 0); + glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); +} + +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, + OGLBufferCache& buffer_cache, QueryCache& query_cache) + : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {} + +Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(value, is_stubbed); +} + +Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(addr, value, is_stubbed); +} + +void FenceManagerOpenGL::QueueFence(Fence& fence) { + fence->Queue(); +} + +bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const { + return fence->IsSignaled(); +} + +void FenceManagerOpenGL::WaitFence(Fence& fence) { + fence->Wait(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h new file mode 100644 index 000000000..da1dcdace --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> + +#include "common/common_types.h" +#include "video_core/fence_manager.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" + +namespace OpenGL { + +class GLInnerFence : public VideoCommon::FenceBase { +public: + GLInnerFence(u32 payload, bool is_stubbed); + GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed); + ~GLInnerFence(); + + void Queue(); + + bool IsSignaled() const; + + void Wait(); + +private: + OGLSync sync_object; +}; + +using Fence = std::shared_ptr<GLInnerFence>; +using GenericFenceManager = + VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>; + +class FenceManagerOpenGL final : public GenericFenceManager { +public: + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, + QueryCache& query_cache); + +protected: + Fence CreateFence(u32 value, bool is_stubbed) override; + Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override; + void QueueFence(Fence& fence) override; + bool IsFenceSignaled(Fence& fence) const override; + void WaitFence(Fence& fence) override; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index f12e9f55f..1a3d9720e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,12 +30,11 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) - : VideoCommon::QueryCacheBase< - QueryCache, CachedQuery, CounterStream, HostCounter, - std::vector<OGLQuery>>{system, - static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, - gl_rasterizer{gl_rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory) + : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>( + rasterizer, maxwell3d, gpu_memory), + gl_rasterizer{rasterizer} {} QueryCache::~QueryCache() = default; @@ -90,13 +89,15 @@ u64 HostCounter::BlockingQuery() const { CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::~CachedQuery() = default; + CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { - VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); cache = rhs.cache; type = rhs.type; + CachedQueryBase<HostCounter>::operator=(std::move(rhs)); return *this; } diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index d8e7052a1..82cac51ee 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,10 +26,11 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; -class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, - HostCounter, std::vector<OGLQuery>> { +class QueryCache final + : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -40,6 +41,7 @@ public: private: RasterizerOpenGL& gl_rasterizer; + std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { @@ -62,10 +64,12 @@ class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); - CachedQuery(CachedQuery&& rhs) noexcept; - CachedQuery(const CachedQuery&) = delete; + ~CachedQuery() override; + CachedQuery(CachedQuery&& rhs) noexcept; CachedQuery& operator=(CachedQuery&& rhs) noexcept; + + CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; void Flush() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f4598fbf7..cfddbde5d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -54,19 +55,36 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = + NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; +constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = + NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; + +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { - if (entry.IsBindless()) { - const Tegra::Texture::TextureHandle tex_handle = - engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); - return engine.GetTextureInfo(tex_handle); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } + if (entry.is_bindless) { + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 offset = - entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); + const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { return engine.GetStageTexture(shader_type, offset); } else { @@ -89,23 +107,84 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { + const u8 index = location / 4; + if (index >= 8 && index <= 39) { + return {GL_GENERIC_ATTRIB_NV, index - 8}; + } + if (index >= 48 && index <= 55) { + return {GL_TEXTURE_COORD_NV, index - 48}; + } + switch (index) { + case 7: + return {GL_POSITION, 0}; + case 40: + return {GL_PRIMARY_COLOR_NV, 0}; + case 41: + return {GL_SECONDARY_COLOR_NV, 0}; + case 42: + return {GL_BACK_PRIMARY_COLOR_NV, 0}; + case 43: + return {GL_BACK_SECONDARY_COLOR_NV, 0}; + } + UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + return {GL_POSITION, 0}; +} + void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } +void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { + if (num_ssbos == 0) { + return; + } + glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), + reinterpret_cast<const GLuint*>(ssbos)); +} + } // Anonymous namespace -RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, - StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, - shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_) + : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), + screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), + texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker), + shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device), + query_cache(*this, maxwell3d, gpu_memory), + buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), + async_shaders(emu_window) { CheckExtensions(); + + unified_uniform_buffer.Create(); + glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); + + if (device.UseAssemblyShaders()) { + glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + for (const GLuint cbuf : staging_cbufs) { + glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), + nullptr, 0); + } + } + + if (device.UseAsynchronousShaders()) { + async_shaders.AllocateWorkers(); + } } -RasterizerOpenGL::~RasterizerOpenGL() {} +RasterizerOpenGL::~RasterizerOpenGL() { + if (device.UseAssemblyShaders()) { + glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + } +} void RasterizerOpenGL::CheckExtensions() { if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { @@ -116,8 +195,7 @@ void RasterizerOpenGL::CheckExtensions() { } void RasterizerOpenGL::SetupVertexFormat() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } @@ -131,13 +209,13 @@ void RasterizerOpenGL::SetupVertexFormat() { // avoid OpenGL errors. // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't // assume every shader uses them all. - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexFormat0 + index]) { continue; } flags[Dirty::VertexFormat0 + index] = false; - const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto attrib = maxwell3d.regs.vertex_attrib_format[index]; const auto gl_index = static_cast<GLuint>(index); // Disable constant attributes. @@ -150,9 +228,10 @@ void RasterizerOpenGL::SetupVertexFormat() { if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { glVertexAttribIFormat(gl_index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); + MaxwellToGL::VertexFormat(attrib), attrib.offset); } else { - glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + glVertexAttribFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexFormat(attrib), attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); } glVertexAttribBinding(gl_index, attrib.buffer); @@ -160,8 +239,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } void RasterizerOpenGL::SetupVertexBuffer() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexBuffers]) { return; } @@ -169,9 +247,11 @@ void RasterizerOpenGL::SetupVertexBuffer() { MICROPROFILE_SCOPE(OpenGL_VB); + const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); + // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; } @@ -184,27 +264,37 @@ void RasterizerOpenGL::SetupVertexBuffer() { const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - - ASSERT(end > start); - const u64 size = end - start + 1; - const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); - - // Bind the vertex array to the buffer at the current offset. - vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer, - vertex_buffer_offset, vertex_array.stride); + ASSERT(end >= start); + + const GLuint gl_index = static_cast<GLuint>(index); + const u64 size = end - start; + if (size == 0) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + if (use_unified_memory) { + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); + } + continue; + } + const auto info = buffer_cache.UploadMemory(start, size); + if (use_unified_memory) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, + info.address + info.offset, size); + } else { + glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); + } } } void RasterizerOpenGL::SetupVertexInstances() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; } flags[Dirty::VertexInstances] = false; - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; } @@ -219,24 +309,23 @@ void RasterizerOpenGL::SetupVertexInstances() { GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t size = CalculateIndexBufferSize(); - const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - vertex_array_pushbuffer.SetIndexBuffer(buffer); - return offset; + const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); + return info.offset; } void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); - auto& gpu = system.GPU().Maxwell3D(); u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; + const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: program_manager.UseGeometryShader(0); @@ -251,23 +340,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } // Currently this stages are not supported in the OpenGL backend. - // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL - if (program == Maxwell::ShaderProgram::TesselationControl) { - continue; - } else if (program == Maxwell::ShaderProgram::TesselationEval) { + // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL + if (program == Maxwell::ShaderProgram::TesselationControl || + program == Maxwell::ShaderProgram::TesselationEval) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); - // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 0 : index - 1; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); - - const GLuint program_handle = shader->GetHandle(); + const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: @@ -284,6 +365,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { shader_config.enable.Value(), shader_config.offset); } + // Stage indices are 0 - 5 + const std::size_t stage = index == 0 ? 0 : index - 1; + SetupDrawConstBuffers(stage, shader); + SetupDrawGlobalMemory(stage, shader); + SetupDrawTextures(stage, shader); + SetupDrawImages(stage, shader); + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -298,11 +386,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - gpu.dirty.flags[Dirty::Shaders] = false; + maxwell3d.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -312,49 +400,42 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { const GPUVAddr start = regs.vertex_array[index].StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end > start); - size += end - start + 1; + size += end - start; + ASSERT(end >= start); } return size; } std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } -void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - shader_cache.LoadDiskCache(stop_loading, callback); -} - -void RasterizerOpenGL::SetupDirtyFlags() { - state_tracker.Initialize(); + shader_cache.LoadDiskCache(title_id, stop_loading, callback); } void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); - auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); - View depth_surface = texture_cache.GetDepthBufferSurface(); + View depth_surface = texture_cache.GetDepthBufferSurface(true); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces FramebufferCacheKey key; const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); for (std::size_t index = 0; index < colors_count; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index)}; + View color_surface{texture_cache.GetColorBufferSurface(index, true)}; if (!color_surface) { continue; } @@ -378,40 +459,62 @@ void RasterizerOpenGL::ConfigureFramebuffers() { glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, - bool using_stencil_fb) { - auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; +void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { + const auto& regs = maxwell3d.regs; texture_cache.GuardRenderTargets(true); View color_surface; - if (using_color_fb) { + + if (using_color) { + // Determine if we have to preserve the contents. + // First we have to make sure all clear masks are enabled. + bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G || + !regs.clear_buffers.B || !regs.clear_buffers.A; const std::size_t index = regs.clear_buffers.RT; - color_surface = texture_cache.GetColorBufferSurface(index); + if (regs.clear_flags.scissor) { + // Then we have to confirm scissor testing clears the whole image. + const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.rt[index].width; + preserve_contents |= scissor.max_y < regs.rt[index].height; + } + + color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents); texture_cache.MarkColorBufferInUse(index); } + View depth_surface; - if (using_depth_fb || using_stencil_fb) { - depth_surface = texture_cache.GetDepthBufferSurface(); + if (using_depth_stencil) { + bool preserve_contents = false; + if (regs.clear_flags.scissor) { + // For depth stencil clears we only have to confirm scissor test covers the whole image. + const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.zeta_width; + preserve_contents |= scissor.max_y < regs.zeta_height; + } + + depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); texture_cache.MarkDepthBufferInUse(); } texture_cache.GuardRenderTargets(false); FramebufferCacheKey key; - key.colors[0] = color_surface; - key.zeta = depth_surface; + key.colors[0] = std::move(color_surface); + key.zeta = std::move(depth_surface); state_tracker.NotifyFramebuffer(); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } void RasterizerOpenGL::Clear() { - const auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -419,8 +522,7 @@ void RasterizerOpenGL::Clear() { if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; - } - if (use_color) { + state_tracker.NotifyColorMask0(); glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); @@ -458,7 +560,7 @@ void RasterizerOpenGL::Clear() { UNIMPLEMENTED_IF(regs.clear_flags.viewport); - ConfigureClearFramebuffer(use_color, use_depth, use_stencil); + ConfigureClearFramebuffer(use_color, use_depth || use_stencil); if (use_color) { glClearBufferfv(GL_COLOR, 0, regs.clear_color); @@ -477,7 +579,6 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); - auto& gpu = system.GPU().Maxwell3D(); query_cache.UpdateCounters(); @@ -502,6 +603,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { SyncFramebufferSRGB(); buffer_cache.Acquire(); + current_cbuf = 0; std::size_t buffer_size = CalculateVertexArraysSize(); @@ -511,20 +613,28 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Uniform space for the 5 shader stages - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * - Maxwell::MaxShaderStage; + buffer_size = + Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); - vertex_array_pushbuffer.Setup(); // Upload vertex and index data. SetupVertexBuffer(); @@ -534,21 +644,19 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { index_buffer_offset = SetupIndexBuffer(); } - // Prepare packed bindings. - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - // Setup emulation uniform buffer. - GLShader::MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); - const auto [buffer, offset] = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - bind_ubo_pushbuffer.Push(EmulationUniformBlockBinding, buffer, offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + if (!device.UseAssemblyShaders()) { + MaxwellUniformData ubo; + ubo.SetFromRegs(maxwell3d); + const auto info = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, + static_cast<GLsizeiptr>(sizeof(ubo))); + } // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -557,11 +665,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Signal the buffer cache that we are not going to upload more things. buffer_cache.Unmap(); - // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. - vertex_array_pushbuffer.Bind(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - program_manager.BindGraphicsPipeline(); if (texture_cache.TextureBarrier()) { @@ -570,14 +673,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { BeginTransformFeedback(primitive_mode); - const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); const GLsizei num_instances = - static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + static_cast<GLsizei>(is_instanced ? maxwell3d.mme_draw.instance_count : 1); if (is_indexed) { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); - const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); } else if (num_instances == 1 && base_instance == 0) { @@ -596,8 +699,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { base_instance); } } else { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count); if (num_instances == 1 && base_instance == 0) { glDrawArrays(primitive_mode, base_vertex, num_vertices); } else if (base_instance == 0) { @@ -611,37 +714,32 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { EndTransformFeedback(); ++num_queued_commands; + + gpu.TickWork(); } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (device.HasBrokenCompute()) { - return; - } - buffer_cache.Acquire(); + current_cbuf = 0; auto kernel = shader_cache.GetComputeKernel(code_addr); + program_manager.BindCompute(kernel->GetHandle()); + SetupComputeTextures(kernel); SetupComputeImages(kernel); - program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); buffer_cache.Map(buffer_size); - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); buffer_cache.Unmap(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -667,6 +765,13 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { query_cache.FlushRegion(addr, size); } +bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + if (!Settings::IsGPULevelHigh()) { + return buffer_cache.MustFlushRegion(addr, size); + } + return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); +} + void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { @@ -678,13 +783,64 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { query_cache.InvalidateRegion(addr, size); } +void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + if (addr == 0 || size == 0) { + return; + } + texture_cache.OnCPUWrite(addr, size); + shader_cache.OnCPUWrite(addr, size); + buffer_cache.OnCPUWrite(addr, size); +} + +void RasterizerOpenGL::SyncGuestHost() { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + texture_cache.SyncGuestHost(); + buffer_cache.SyncGuestHost(); + shader_cache.SyncGuestHost(); +} + +void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { + if (!gpu.IsAsync()) { + gpu_memory.Write<u32>(addr, value); + return; + } + fence_manager.SignalSemaphore(addr, value); +} + +void RasterizerOpenGL::SignalSyncPoint(u32 value) { + if (!gpu.IsAsync()) { + gpu.IncrementSyncPoint(value); + return; + } + fence_manager.SignalSyncPoint(value); +} + +void RasterizerOpenGL::ReleaseFences() { + if (!gpu.IsAsync()) { + return; + } + fence_manager.WaitPendingFences(); +} + void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { FlushRegion(addr, size); } InvalidateRegion(addr, size); } +void RasterizerOpenGL::WaitForIdle() { + // Place a barrier on everything that is not framebuffer related. + // This is related to another flag that is not currently implemented. + glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | + GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | + GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | + GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | + GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); +} + void RasterizerOpenGL::FlushCommands() { // Only flush when we have commands queued to OpenGL. if (num_queued_commands == 0) { @@ -739,40 +895,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { + static constexpr std::array PARAMETER_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& stages = maxwell3d.state.shader_stages; const auto& shader_stage = stages[stage_index]; - - u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetEntries().const_buffers) { - const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + const auto& entries = shader->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; + const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; + + const auto base_bindings = device.GetBaseBindings(stage_index); + u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; + for (const auto& entry : entries.const_buffers) { + const u32 index = entry.GetIndex(); + const auto& buffer = shader_stage.const_buffers[index]; + SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, + base_unified_offset + index * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + + entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, + base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + const auto& entries = kernel->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; u32 binding = 0; - for (const auto& entry : kernel->GetEntries().const_buffers) { + for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, + use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, + NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { +void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, + const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + if (device.UseAssemblyShaders()) { + glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); + } return; } @@ -780,68 +968,112 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const // UBO alignment requirements. const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - bind_ubo_pushbuffer.Push(binding, cbuf, offset, size); + const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + + const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); + const GPUVAddr gpu_addr = buffer.address; + auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + + if (device.UseAssemblyShaders()) { + UNIMPLEMENTED_IF(use_unified); + if (info.offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); + info.handle = staging_cbuf; + info.offset = 0; + } + glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); + return; + } + + if (use_unified) { + glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, + unified_offset, size); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); + } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { + static constexpr std::array TARGET_LUT = { + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, + }; + + const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; + const auto& entries{shader->GetEntries().global_memory_entries}; - u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetEntries().global_memory_entries) { - const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); + + const bool assembly_shaders = device.UseAssemblyShaders(); + u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (assembly_shaders) { + UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; + const auto& entries{kernel->GetEntries().global_memory_entries}; + + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); u32 binding = 0; - for (const auto& entry : kernel->GetEntries().global_memory_entries) { - const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (device.UseAssemblyShaders()) { + UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size) { - const auto alignment{device.GetShaderStorageBufferAlignment()}; - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten()); - bind_ssbo_pushbuffer.Push(binding, ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); + GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { + const size_t alignment{device.GetShaderStorageBufferAlignment()}; + const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); + if (device.UseAssemblyShaders()) { + *ssbo = BindlessSSBO{ + .address = static_cast<GLuint64EXT>(info.address + info.offset), + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, + static_cast<GLsizeiptr>(size)); + } } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); - for (std::size_t i = 0; i < entry.Size(); ++i) { + for (std::size_t i = 0; i < entry.size; ++i) { const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); SetupTexture(binding++, texture, entry); } } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetEntries().samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); } } @@ -856,33 +1088,27 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu glBindTextureUnit(binding, 0); return; } - glBindTextureUnit(binding, view->GetTexture()); - - if (view->GetSurfaceParams().IsBuffer()) { - return; + const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); + glBindTextureUnit(binding, handle); + if (!view->GetSurfaceParams().IsBuffer()) { + glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } - // Apply swizzle to textures that are not buffers. - view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, - texture.tic.w_source); - - glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { - const auto& maxwell3d = system.GPU().Maxwell3D(); +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); + const auto shader_type = static_cast<ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { - const auto& compute = system.GPU().KeplerCompute(); +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } @@ -894,27 +1120,43 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); return; } - if (!tic.IsBuffer()) { - view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - } - if (entry.IsWritten()) { + if (entry.is_written) { view->MarkAsModified(texture_cache.Tick()); } - glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE, - view->GetFormat()); + const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat()); } void RasterizerOpenGL::SyncViewport() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; const bool dirty_viewport = flags[Dirty::Viewports]; + const bool dirty_clip_control = flags[Dirty::ClipControl]; + + if (dirty_clip_control || flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + + GLenum mode = MaxwellToGL::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0 && + regs.viewport_transform[0].scale_y < 0.0f) { + switch (mode) { + case GL_CW: + mode = GL_CCW; + break; + case GL_CCW: + mode = GL_CW; + break; + } + } + glFrontFace(mode); + } + if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { @@ -946,34 +1188,36 @@ void RasterizerOpenGL::SyncViewport() { const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; const GLdouble far_depth = src.translate_z + src.scale_z; glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth); + + if (!GLAD_GL_NV_viewport_swizzle) { + continue; + } + glViewportSwizzleNV(static_cast<GLuint>(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x), + MaxwellToGL::ViewportSwizzle(src.swizzle.y), + MaxwellToGL::ViewportSwizzle(src.swizzle.z), + MaxwellToGL::ViewportSwizzle(src.swizzle.w)); } } } void RasterizerOpenGL::SyncDepthClamp() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::DepthClampEnabled]) { return; } flags[Dirty::DepthClampEnabled] = false; - const auto& state = gpu.regs.view_volume_clip_control; - UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near, - "Unimplemented depth clamp separation!"); - - oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near); + oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { return; } flags[Dirty::ClipDistances] = false; - clip_mask &= gpu.regs.clip_distance_enabled; + clip_mask &= maxwell3d.regs.clip_distance_enabled; if (clip_mask == last_clip_distance_mask) { return; } @@ -989,9 +1233,8 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::CullTest]) { flags[Dirty::CullTest] = false; @@ -1003,34 +1246,27 @@ void RasterizerOpenGL::SyncCullMode() { glDisable(GL_CULL_FACE); } } - - if (flags[Dirty::FrontFace]) { - flags[Dirty::FrontFace] = false; - glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); - } } void RasterizerOpenGL::SyncPrimitiveRestart() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PrimitiveRestart]) { return; } flags[Dirty::PrimitiveRestart] = false; - if (gpu.regs.primitive_restart.enabled) { + if (maxwell3d.regs.primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); - glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index); } else { glDisable(GL_PRIMITIVE_RESTART); } } void RasterizerOpenGL::SyncDepthTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; - const auto& regs = gpu.regs; if (flags[Dirty::DepthMask]) { flags[Dirty::DepthMask] = false; glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE); @@ -1048,14 +1284,13 @@ void RasterizerOpenGL::SyncDepthTestState() { } void RasterizerOpenGL::SyncStencilTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::StencilTest]) { return; } flags[Dirty::StencilTest] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), @@ -1080,25 +1315,24 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncRasterizeEnable() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::RasterizeEnable]) { return; } flags[Dirty::RasterizeEnable] = false; - oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); + oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0); } void RasterizerOpenGL::SyncPolygonModes() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonModes]) { return; } flags[Dirty::PolygonModes] = false; - if (gpu.regs.fill_rectangle) { + const auto& regs = maxwell3d.regs; + if (regs.fill_rectangle) { if (!GLAD_GL_NV_fill_rectangle) { LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); @@ -1111,27 +1345,26 @@ void RasterizerOpenGL::SyncPolygonModes() { return; } - if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + if (regs.polygon_mode_front == regs.polygon_mode_back) { flags[Dirty::PolygonModeFront] = false; flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); return; } if (flags[Dirty::PolygonModeFront]) { flags[Dirty::PolygonModeFront] = false; - glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); } if (flags[Dirty::PolygonModeBack]) { flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back)); } } void RasterizerOpenGL::SyncColorMask() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ColorMasks]) { return; } @@ -1140,7 +1373,7 @@ void RasterizerOpenGL::SyncColorMask() { const bool force = flags[Dirty::ColorMaskCommon]; flags[Dirty::ColorMaskCommon] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.color_mask_common) { if (!force && !flags[Dirty::ColorMask0]) { return; @@ -1165,33 +1398,30 @@ void RasterizerOpenGL::SyncColorMask() { } void RasterizerOpenGL::SyncMultiSampleState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::MultisampleControl]) { return; } flags[Dirty::MultisampleControl] = false; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FragmentClampColor]) { return; } flags[Dirty::FragmentClampColor] = false; - glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); + glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::BlendColor]) { flags[Dirty::BlendColor] = false; @@ -1248,14 +1478,13 @@ void RasterizerOpenGL::SyncBlendState() { } void RasterizerOpenGL::SyncLogicOpState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LogicOp]) { return; } flags[Dirty::LogicOp] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.logic_op.enable) { glEnable(GL_COLOR_LOGIC_OP); glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); @@ -1265,14 +1494,13 @@ void RasterizerOpenGL::SyncLogicOpState() { } void RasterizerOpenGL::SyncScissorTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::Scissors]) { return; } flags[Dirty::Scissors] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -1291,16 +1519,15 @@ void RasterizerOpenGL::SyncScissorTest() { } void RasterizerOpenGL::SyncPointState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PointSize]) { return; } flags[Dirty::PointSize] = false; - oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); - if (gpu.regs.vp_point_size.enable) { + if (maxwell3d.regs.vp_point_size.enable) { // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. glEnable(GL_PROGRAM_POINT_SIZE); return; @@ -1308,32 +1535,30 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). - glPointSize(std::max(1.0f, gpu.regs.point_size)); + glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncLineState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LineWidth]) { return; } flags[Dirty::LineWidth] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable); glLineWidth(regs.line_smooth_enable ? regs.line_width_smooth : regs.line_width_aliased); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonOffset]) { return; } flags[Dirty::PolygonOffset] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); @@ -1347,18 +1572,13 @@ void RasterizerOpenGL::SyncPolygonOffset() { } void RasterizerOpenGL::SyncAlphaTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::AlphaTest]) { return; } flags[Dirty::AlphaTest] = false; - const auto& regs = gpu.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - + const auto& regs = maxwell3d.regs; if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); @@ -1368,22 +1588,79 @@ void RasterizerOpenGL::SyncAlphaTest() { } void RasterizerOpenGL::SyncFramebufferSRGB() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FramebufferSRGB]) { return; } flags[Dirty::FramebufferSRGB] = false; - oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); + oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb); +} + +void RasterizerOpenGL::SyncTransformFeedback() { + // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal + // when this is required. + const auto& regs = maxwell3d.regs; + + static constexpr std::size_t STRIDE = 3; + std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; + std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams; + + GLint* cursor = attribs.data(); + GLint* current_stream = streams.data(); + + for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { + const auto& layout = regs.tfb_layouts[feedback]; + UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); + if (layout.varying_count == 0) { + continue; + } + + *current_stream = static_cast<GLint>(feedback); + if (current_stream != streams.data()) { + // When stepping one stream, push the expected token + cursor[0] = GL_NEXT_BUFFER_NV; + cursor[1] = 0; + cursor[2] = 0; + cursor += STRIDE; + } + ++current_stream; + + const auto& locations = regs.tfb_varying_locs[feedback]; + std::optional<u8> current_index; + for (u32 offset = 0; offset < layout.varying_count; ++offset) { + const u8 location = locations[offset]; + const u8 index = location / 4; + + if (current_index == index) { + // Increase number of components of the previous attachment + ++cursor[-2]; + continue; + } + current_index = index; + + std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); + cursor[1] = 1; + cursor += STRIDE; + } + } + + const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE); + const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data()); + glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), + GL_INTERLEAVED_ATTRIBS); } void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (device.UseAssemblyShaders()) { + SyncTransformFeedback(); + } + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1410,11 +1687,15 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { static_cast<GLsizeiptr>(size)); } + // We may have to call BeginTransformFeedbackNV here since they seem to call different + // implementations on Nvidia's driver (the pointer is different) but we are using + // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB + // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works. glBeginTransformFeedback(GL_POINTS); } void RasterizerOpenGL::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1431,8 +1712,9 @@ void RasterizerOpenGL::EndTransformFeedback() { const GLuint handle = transform_feedback_buffers[index].handle; const GPUVAddr gpu_addr = binding.Address(); const std::size_t size = binding.buffer_size; - const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, + static_cast<GLsizeiptr>(size)); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 435da4425..1d0f585fa 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,10 +19,10 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -33,10 +33,11 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" -namespace Core { -class System; +namespace Core::Memory { +class Memory; } namespace Core::Frontend { @@ -52,10 +53,18 @@ namespace OpenGL { struct ScreenInfo; struct DrawParameters; +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); + class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Core::Memory::Memory& cpu_memory, const Device& device, + ScreenInfo& screen_info, ProgramManager& program_manager, StateTracker& state_tracker); ~RasterizerOpenGL() override; @@ -66,8 +75,15 @@ public: void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; + void OnCPUWrite(VAddr addr, u64 size) override; + void SyncGuestHost() override; + void SignalSemaphore(GPUVAddr addr, u32 value) override; + void SignalSyncPoint(u32 value) override; + void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -75,56 +91,65 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { return num_queued_commands > 0; } + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); - void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); + /// Configures the color and depth framebuffer for clearing. + void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. - void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry); + void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); - /// Configures a constant buffer. + /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size); + size_t size, BindlessSSBO* ssbo); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); @@ -192,6 +217,10 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs transform feedback state to match guest state + /// @note Only valid on assembly shaders + void SyncTransformFeedback(); + /// Begin a transform feedback void BeginTransformFeedback(GLenum primitive_mode); @@ -215,31 +244,42 @@ private: void SetupShaders(GLenum primitive_mode); - const Device device; + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + const Device& device; + ScreenInfo& screen_info; + ProgramManager& program_manager; + StateTracker& state_tracker; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; QueryCache query_cache; + OGLBufferCache buffer_cache; + FenceManagerOpenGL fence_manager; - Core::System& system; - ScreenInfo& screen_info; - GLShader::ProgramManager& program_manager; - StateTracker& state_tracker; + VideoCommon::Shader::AsyncShaders async_shaders; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - OGLBufferCache buffer_cache; - VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker}; - BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; - BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + GLint vertex_binding = 0; std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> transform_feedback_buffers; std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> enabled_transform_feedback_buffers; + static constexpr std::size_t NUM_CONSTANT_BUFFERS = + Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; + std::size_t current_cbuf = 0; + OGLBuffer unified_uniform_buffer; + /// Number of commands queued to the OpenGL driver. Reseted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 97803d480..0ebcec427 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -82,11 +83,13 @@ void OGLSampler::Release() { handle = 0; } -void OGLShader::Create(const char* source, GLenum type) { - if (handle != 0) +void OGLShader::Create(std::string_view source, GLenum type) { + if (handle != 0) { return; - if (source == nullptr) + } + if (source.empty()) { return; + } MICROPROFILE_SCOPE(OpenGL_ResourceCreation); handle = GLShader::LoadShader(source, type); @@ -125,6 +128,15 @@ void OGLProgram::Release() { handle = 0; } +void OGLAssemblyProgram::Release() { + if (handle == 0) { + return; + } + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgramsARB(1, &handle); + handle = 0; +} + void OGLPipeline::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index de93f4212..f48398669 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,6 +4,7 @@ #pragma once +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -127,7 +128,7 @@ public: return *this; } - void Create(const char* source, GLenum type); + void Create(std::string_view source, GLenum type); void Release(); @@ -167,6 +168,28 @@ public: GLuint handle = 0; }; +class OGLAssemblyProgram : private NonCopyable { +public: + OGLAssemblyProgram() = default; + + OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLAssemblyProgram() { + Release(); + } + + OGLAssemblyProgram& operator=(OGLAssemblyProgram&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLPipeline : private NonCopyable { public: OGLPipeline() = default; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 12c6dcfde..bd56bed0c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -10,8 +10,6 @@ #include <thread> #include <unordered_set> -#include <boost/functional/hash.hpp> - #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -22,83 +20,35 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace OpenGL { using Tegra::Engines::ShaderType; -using VideoCommon::Shader::CompileDepth; -using VideoCommon::Shader::CompilerSettings; +using VideoCommon::Shader::GetShaderAddress; +using VideoCommon::Shader::GetShaderCode; +using VideoCommon::Shader::GetUniqueIdentifier; +using VideoCommon::Shader::KERNEL_MAIN_OFFSET; using VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::Registry; using VideoCommon::Shader::ShaderIR; +using VideoCommon::Shader::STAGE_MAIN_OFFSET; namespace { -constexpr u32 STAGE_MAIN_OFFSET = 10; -constexpr u32 KERNEL_MAIN_OFFSET = 0; - -constexpr CompilerSettings COMPILER_SETTINGS{CompileDepth::FullDecompile}; - -/// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; -} - -/// Gets if the current instruction offset is a scheduler instruction -constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { - // Sched instructions appear once every 4 instructions. - constexpr std::size_t SchedPeriod = 4; - const std::size_t absolute_offset = offset - main_offset; - return (absolute_offset % SchedPeriod) == 0; -} - -/// Calculates the size of a program stream -std::size_t CalculateProgramSize(const ProgramCode& program) { - constexpr std::size_t start_offset = 10; - // This is the encoded version of BRA that jumps to itself. All Nvidia - // shaders end with one. - constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; - constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; - std::size_t offset = start_offset; - while (offset < program.size()) { - const u64 instruction = program[offset]; - if (!IsSchedInstruction(offset, start_offset)) { - if ((instruction & mask) == self_jumping_branch) { - // End on Maxwell's "nop" instruction - break; - } - if (instruction == 0) { - break; - } - } - offset++; - } - // The last instruction is included in the program size - return std::min(offset + 1, program.size()); -} - -/// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, - const u8* host_ptr) { - ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); - ASSERT_OR_EXECUTE(host_ptr != nullptr, { - std::fill(code.begin(), code.end(), 0); - return code; - }); - memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64)); - code.resize(CalculateProgramSize(code)); - return code; -} +constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{}; /// Gets the shader type from a Maxwell program type constexpr GLenum GetGLShaderType(ShaderType shader_type) { @@ -116,17 +66,6 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) { } } -/// Hashes one (or two) program streams -u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code, - const ProgramCode& code_b = {}) { - u64 unique_identifier = boost::hash_value(code); - if (is_a) { - // VertexA programs include two programs - boost::hash_combine(unique_identifier, boost::hash_value(code_b)); - } - return unique_identifier; -} - constexpr const char* GetShaderTypeName(ShaderType shader_type) { switch (shader_type) { case ShaderType::Vertex: @@ -162,6 +101,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } +constexpr GLenum AssemblyEnum(ShaderType shader_type) { + switch (shader_type) { + case ShaderType::Vertex: + return GL_VERTEX_PROGRAM_NV; + case ShaderType::TesselationControl: + return GL_TESS_CONTROL_PROGRAM_NV; + case ShaderType::TesselationEval: + return GL_TESS_EVALUATION_PROGRAM_NV; + case ShaderType::Geometry: + return GL_GEOMETRY_PROGRAM_NV; + case ShaderType::Fragment: + return GL_FRAGMENT_PROGRAM_NV; + case ShaderType::Compute: + return GL_COMPUTE_PROGRAM_NV; + } + return {}; +} + std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } @@ -170,7 +127,7 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size}; const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer, entry.graphics_info, entry.compute_info}; - const auto registry = std::make_shared<Registry>(entry.type, info); + auto registry = std::make_shared<Registry>(entry.type, info); for (const auto& [address, value] : entry.keys) { const auto [buffer, offset] = address; registry->InsertKey(buffer, offset, value); @@ -185,21 +142,6 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { return registry; } -std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, - u64 unique_identifier, const ShaderIR& ir, - const Registry& registry, bool hint_retrievable = false) { - const std::string shader_id = MakeShaderID(unique_identifier, shader_type); - LOG_INFO(Render_OpenGL, "{}", shader_id); - - const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); - OGLShader shader; - shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); - - auto program = std::make_shared<OGLProgram>(); - program->Create(true, hint_retrievable, shader.handle); - return program; -} - std::unordered_set<GLenum> GetSupportedFormats() { GLint num_formats; glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); @@ -216,55 +158,138 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program)} {} +ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, + const ShaderIR& ir, const Registry& registry, bool hint_retrievable) { + const std::string shader_id = MakeShaderID(unique_identifier, shader_type); + LOG_INFO(Render_OpenGL, "{}", shader_id); + + auto program = std::make_shared<ProgramHandle>(); + + if (device.UseAssemblyShaders()) { + const std::string arb = + DecompileAssemblyShader(device, ir, registry, shader_type, shader_id); + + GLuint& arb_prog = program->assembly_program.handle; + +// Commented out functions signal OpenGL errors but are compatible with apitrace. +// Use them only to capture and replay on apitrace. +#if 0 + glGenProgramsNV(1, &arb_prog); + glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()), + reinterpret_cast<const GLubyte*>(arb.data())); +#else + glGenProgramsARB(1, &arb_prog); + glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB, + static_cast<GLsizei>(arb.size()), arb.data()); +#endif + const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV)); + if (err && *err) { + LOG_CRITICAL(Render_OpenGL, "{}", err); + LOG_INFO(Render_OpenGL, "\n{}", arb); + } + } else { + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); + OGLShader shader; + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + + program->source_program.Create(true, hint_retrievable, shader.handle); + } -CachedShader::~CachedShader() = default; + return program; +} -GLuint CachedShader::GetHandle() const { +Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_, bool is_built) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)}, + is_built(is_built) { + handle = program->assembly_program.handle; + if (handle == 0) { + handle = program->source_program.handle; + } + if (is_built) { + ASSERT(handle != 0); + } +} + +Shader::~Shader() = default; + +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); - return program->handle; + return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +bool Shader::IsBuilt() const { + return is_built; +} + +void Shader::AsyncOpenGLBuilt(OGLProgram new_program) { + program->source_program = std::move(new_program); + handle = program->source_program.handle; + is_built = true; +} + +void Shader::AsyncGLASMBuilt(OGLAssemblyProgram new_program) { + program->assembly_program = std::move(new_program); + handle = program->assembly_program.handle; + is_built = true; +} + +std::unique_ptr<Shader> Shader::CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, + ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); - const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional<ShaderIR> ir_b; - // if (!code_b.empty()) { - // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); - // } - auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); + + auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D()); + if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) { + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + // TODO(Rodrigo): Handle VertexA shaders + // std::optional<ShaderIR> ir_b; + // if (!code_b.empty()) { + // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); + // } + auto program = + BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, shader_type), + std::move(program), true)); + } else { + // Required for entries + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + auto entries = MakeEntries(params.device, ir, shader_type); - ShaderDiskCacheEntry entry; - entry.type = shader_type; - entry.code = std::move(code); - entry.code_b = std::move(code_b); - entry.unique_identifier = params.unique_identifier; - entry.bound_buffer = registry->GetBoundBuffer(); - entry.graphics_info = registry->GetGraphicsInfo(); - entry.keys = registry->GetKeys(); - entry.bound_samplers = registry->GetBoundSamplers(); - entry.bindless_samplers = registry->GetBindlessSamplers(); - params.disk_cache.SaveEntry(std::move(entry)); + async_shaders.QueueOpenGLShader(params.device, shader_type, params.unique_identifier, + std::move(code), std::move(code_b), STAGE_MAIN_OFFSET, + COMPILER_SETTINGS, *registry, cpu_addr); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + auto program = std::make_shared<ProgramHandle>(); + return std::unique_ptr<Shader>( + new Shader(std::move(registry), std::move(entries), std::move(program), false)); + } } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - const std::size_t size_in_bytes = code.size() * sizeof(u64); +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); - auto& engine = params.system.GPU().KeplerCompute(); - auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); @@ -280,31 +305,43 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr<Shader>(new Shader( + precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_}, + gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, device{device_} {} -void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; + +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { + disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } - const std::vector gl_cache = disk_cache.LoadPrecompiled(); + std::vector<ShaderDiskCachePrecompiled> gl_cache; + if (!device.UseAssemblyShaders()) { + // Only load precompiled cache when we are not using assembly shaders + gl_cache = disk_cache.LoadPrecompiled(); + } const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during loading to know if we have to @@ -343,7 +380,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, auto registry = MakeRegistry(entry); const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; if (precompiled_entry) { // If the shader is precompiled, attempt to load it with program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); @@ -359,7 +396,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, PrecompiledShader shader; shader.program = std::move(program); shader.registry = std::move(registry); - shader.entries = MakeEntries(ir); + shader.entries = MakeEntries(device, ir, entry.type); std::scoped_lock lock{mutex}; if (callback) { @@ -370,7 +407,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } }; - const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)}; + const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())}; const std::size_t bucket_size{transferable->size() / num_workers}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers); std::vector<std::thread> threads(num_workers); @@ -397,6 +434,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } + if (device.UseAssemblyShaders()) { + // Don't store precompiled binaries for assembly shaders. + return; + } + // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them @@ -404,7 +446,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const u64 id = (*transferable)[i].unique_identifier; const auto it = find_precompiled(id); if (it == gl_cache.end()) { - const GLuint program = runtime_cache.at(id).program->handle; + const GLuint program = runtime_cache.at(id).program->source_program.handle; disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } @@ -415,7 +457,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( +ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats) { if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { @@ -423,15 +465,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return {}; } - auto program = std::make_shared<OGLProgram>(); - program->handle = glCreateProgram(); - glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(program->handle, precompiled_entry.binary_format, - precompiled_entry.binary.data(), + auto program = std::make_shared<ProgramHandle>(); + GLuint& handle = program->source_program.handle; + handle = glCreateProgram(); + glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(), static_cast<GLsizei>(precompiled_entry.binary.size())); GLint link_status; - glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); + glGetProgramiv(handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; @@ -440,77 +482,122 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { - return last_shaders[static_cast<std::size_t>(program)]; +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders) { + if (!maxwell3d.dirty.flags[Dirty::Shaders]) { + auto* last_shader = last_shaders[static_cast<std::size_t>(program)]; + if (last_shader->IsBuilt()) { + return last_shader; + } } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr address{GetShaderAddress(system, program)}; + const GPUVAddr address{GetShaderAddress(maxwell3d, program)}; + + if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { + auto completed_work = async_shaders.GetCompletedWork(); + for (auto& work : completed_work) { + Shader* shader = TryGet(work.cpu_address); + gpu.ShaderNotify().MarkShaderComplete(); + if (shader == nullptr) { + continue; + } + using namespace VideoCommon::Shader; + if (work.backend == AsyncShaders::Backend::OpenGL) { + shader->AsyncOpenGLBuilt(std::move(work.program.opengl)); + } else if (work.backend == AsyncShaders::Backend::GLASM) { + shader->AsyncGLASMBuilt(std::move(work.program.glasm)); + } + + auto& registry = shader->GetRegistry(); + + ShaderDiskCacheEntry entry; + entry.type = work.shader_type; + entry.code = std::move(work.code); + entry.code_b = std::move(work.code_b); + entry.unique_identifier = work.uid; + entry.bound_buffer = registry.GetBoundBuffer(); + entry.graphics_info = registry.GetGraphicsInfo(); + entry.keys = registry.GetKeys(); + entry.bound_samplers = registry.GetBoundSamplers(); + entry.bindless_samplers = registry.GetBindlessSamplers(); + disk_cache.SaveEntry(std::move(entry)); + } + } // Look up shader in the cache based on address - const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : nullptr}; - if (shader) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)}; + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } - const auto host_ptr{memory_manager.GetPointer(address)}; + const u8* const host_ptr{gpu_memory.GetPointer(address)}; // No shader found - create a new one - ProgramCode code{GetShaderCode(memory_manager, address, host_ptr)}; + ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)}; ProgramCode code_b; if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; - code_b = GetShaderCode(memory_manager, address_b, memory_manager.GetPointer(address_b)); + const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)}; + const u8* host_ptr_b = gpu_memory.GetPointer(address_b); + code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false); } + const std::size_t code_size = code.size() * sizeof(u64); - const auto unique_identifier = GetUniqueIdentifier( + const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, maxwell3d, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b), + async_shaders, cpu_addr.value_or(0)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } - Register(shader); - return last_shaders[static_cast<std::size_t>(program)] = shader; + Shader* const result = shader.get(); + if (cpu_addr) { + Register(std::move(shader), *cpu_addr, code_size); + } else { + null_shader = std::move(shader); + } + + return last_shaders[static_cast<std::size_t>(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : nullptr; - if (kernel) { + if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } - const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; + const u8* host_ptr{gpu_memory.GetPointer(code_addr)}; + ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)}; + const std::size_t code_size{code.size() * sizeof(u64)}; + const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, kepler_compute, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } - Register(kernel); - return kernel; + Shader* const result = kernel.get(); + if (cpu_addr) { + Register(std::move(kernel), *cpu_addr, code_size); + } else { + null_kernel = std::move(kernel); + } + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index c836df5bd..1708af06a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,114 +18,143 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" -namespace Core { -class System; +namespace Tegra { +class MemoryManager; } namespace Core::Frontend { class EmuWindow; } +namespace VideoCommon::Shader { +class AsyncShaders; +} + namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; -struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; +struct ProgramHandle { + OGLProgram source_program; + OGLAssemblyProgram assembly_program; +}; +using ProgramSharedPtr = std::shared_ptr<ProgramHandle>; + struct PrecompiledShader { - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; }; struct ShaderParameters { - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::ConstBufferEngineInterface& engine; ShaderDiskCacheOpenGL& disk_cache; const Device& device; VAddr cpu_addr; - u8* host_ptr; + const u8* host_ptr; u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +ProgramSharedPtr BuildShader(const Device& device, Tegra::Engines::ShaderType shader_type, + u64 unique_identifier, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + bool hint_retrievable = false); + +class Shader final { public: - ~CachedShader(); + ~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } + bool IsBuilt() const; /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + const VideoCommon::Shader::Registry& GetRegistry() const { + return *registry; + } + + /// Mark a OpenGL shader as built + void AsyncOpenGLBuilt(OGLProgram new_program); + + /// Mark a GLASM shader as built + void AsyncGLASMBuilt(OGLAssemblyProgram new_program); + + static std::unique_ptr<Shader> CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b, + VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr); + + static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program); + explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, + ProgramSharedPtr program, bool is_built = true); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; - std::size_t size_in_bytes = 0; - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; + GLuint handle = 0; + bool is_built{}; }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window, + Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. - void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: - std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( + ProgramSharedPtr GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats); - Core::System& system; Core::Frontend::EmuWindow& emu_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; const Device& device; + ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; + + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index b1804e9ea..95ca96c8e 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; +using Tegra::Shader::TextureType; using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; @@ -61,8 +62,8 @@ struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint @@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { + const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); + // We waste one UBO for emulation + const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; + return num_ubos > num_available_ubos; +} + struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -412,8 +420,9 @@ class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, + suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device, ir, stage)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -484,7 +493,7 @@ private: code.AddLine("switch (jmp_to) {{"); for (const auto& pair : ir.GetBasicBlocks()) { - const auto [address, bb] = pair; + const auto& [address, bb] = pair; code.AddLine("case 0x{:X}U: {{", address); ++code.scope; @@ -518,6 +527,9 @@ private: if (device.HasImageLoadFormatted()) { code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasTextureShadowLod()) { + code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); + } if (device.HasWarpIntrinsics()) { code.AddLine("#extension GL_NV_gpu_shader5 : require"); code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -590,8 +602,15 @@ private: return; } const auto& info = registry.GetComputeInfo(); - if (const u32 size = info.shared_memory_size_in_words; size > 0) { - code.AddLine("shared uint smem[{}];", size); + if (u32 size = info.shared_memory_size_in_words * 4; size > 0) { + const u32 limit = device.GetMaxComputeSharedMemorySize(); + if (size > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size, limit); + size = limit; + } + + code.AddLine("shared uint smem[{}];", size / 4); code.AddNewLine(); } code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;", @@ -618,7 +637,9 @@ private: break; } } - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + + if (stage != ShaderType::Geometry && + (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } @@ -647,6 +668,16 @@ private: --code.scope; code.AddLine("}};"); code.AddNewLine(); + + if (stage == ShaderType::Geometry) { + if (ir.UsesLayer()) { + code.AddLine("out int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("out int gl_ViewportIndex;"); + } + } + code.AddNewLine(); } void DeclareRegisters() { @@ -782,7 +813,7 @@ private: const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); const auto it = transform_feedback.find(location); if (it == transform_feedback.end()) { - return {}; + return std::nullopt; } return it->second.components; } @@ -834,11 +865,24 @@ private: } void DeclareConstantBuffers() { + if (use_unified_uniforms) { + const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + + static_cast<u32>(ir.GetGlobalMemory().size()); + code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", + binding); + code.AddLine(" uint cbufs[];"); + code.AddLine("}};"); + code.AddNewLine(); + return; + } + u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto& [index, cbuf] : ir.GetConstantBuffers()) { + for (const auto [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); - code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); code.AddLine("}};"); code.AddNewLine(); } @@ -869,37 +913,37 @@ private: for (const auto& sampler : ir.GetSamplers()) { const std::string name = GetSampler(sampler); const std::string description = fmt::format("layout (binding = {}) uniform", binding); - binding += sampler.IsIndexed() ? sampler.Size() : 1; + binding += sampler.is_indexed ? sampler.size : 1; std::string sampler_type = [&]() { - if (sampler.IsBuffer()) { + if (sampler.is_buffer) { return "samplerBuffer"; } - switch (sampler.GetType()) { - case Tegra::Shader::TextureType::Texture1D: + switch (sampler.type) { + case TextureType::Texture1D: return "sampler1D"; - case Tegra::Shader::TextureType::Texture2D: + case TextureType::Texture2D: return "sampler2D"; - case Tegra::Shader::TextureType::Texture3D: + case TextureType::Texture3D: return "sampler3D"; - case Tegra::Shader::TextureType::TextureCube: + case TextureType::TextureCube: return "samplerCube"; default: UNREACHABLE(); return "sampler2D"; } }(); - if (sampler.IsArray()) { + if (sampler.is_array) { sampler_type += "Array"; } - if (sampler.IsShadow()) { + if (sampler.is_shadow) { sampler_type += "Shadow"; } - if (!sampler.IsIndexed()) { + if (!sampler.is_indexed) { code.AddLine("{} {} {};", description, sampler_type, name); } else { - code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size()); + code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.size); } } if (!ir.GetSamplers().empty()) { @@ -945,14 +989,14 @@ private: u32 binding = device.GetBaseBindings(stage).image; for (const auto& image : ir.GetImages()) { std::string qualifier = "coherent volatile"; - if (image.IsRead() && !image.IsWritten()) { + if (image.is_read && !image.is_written) { qualifier += " readonly"; - } else if (image.IsWritten() && !image.IsRead()) { + } else if (image.is_written && !image.is_read) { qualifier += " writeonly"; } - const char* format = image.IsAtomic() ? "r32ui, " : ""; - const char* type_declaration = GetImageTypeDeclaration(image.GetType()); + const char* format = image.is_atomic ? "r32ui, " : ""; + const char* type_declaration = GetImageTypeDeclaration(image.type); code.AddLine("layout ({}binding = {}) {} uniform uimage{} {};", format, binding++, qualifier, type_declaration, GetImage(image)); } @@ -1037,42 +1081,51 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); + const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; + if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; + if (use_unified_uniforms) { + return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), + Type::Uint}; + } else { + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; + } } - if (std::holds_alternative<OperationNode>(*offset)) { - // Indirect access - const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); + // Indirect access + if (use_unified_uniforms) { + return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, + Visit(offset).AsUint()), + Type::Uint}; + } - if (!device.HasComponentIndexingBug()) { - return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset), - Type::Uint}; - } + const std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); - // AMD's proprietary GLSL compiler emits ill code for variable component access. - // To bypass this driver bug generate 4 ifs, one per each component. - const std::string pack = code.GenerateTemporary(); - code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), - final_offset); - - const std::string result = code.GenerateTemporary(); - code.AddLine("uint {};", result); - for (u32 swizzle = 0; swizzle < 4; ++swizzle) { - code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, - pack, GetSwizzle(swizzle)); - } - return {result, Type::Uint}; + if (!device.HasComponentIndexingBug()) { + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } - UNREACHABLE_MSG("Unmanaged offset node type"); + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("uint {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, + GetSwizzle(swizzle)); + } + return {result, Type::Uint}; } if (const auto gmem = std::get_if<GmemNode>(&*node)) { @@ -1144,6 +1197,7 @@ private: return {"gl_FragCoord"s + GetSwizzle(element), Type::Float}; default: UNREACHABLE(); + return {"0", Type::Int}; } case Attribute::Index::FrontColor: return {"gl_Color"s + GetSwizzle(element), Type::Float}; @@ -1241,21 +1295,21 @@ private: switch (element) { case 0: UNIMPLEMENTED(); - return {}; + return std::nullopt; case 1: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_Layer", Type::Int}}; case 2: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_ViewportIndex", Type::Int}}; case 3: return {{"gl_PointSize", Type::Float}}; } - return {}; + return std::nullopt; case Attribute::Index::FrontColor: return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::FrontSecondaryColor: @@ -1278,7 +1332,7 @@ private: Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); - return {}; + return std::nullopt; } } @@ -1335,16 +1389,27 @@ private: ASSERT(meta); const std::size_t count = operation.GetOperandsCount(); - const bool has_array = meta->sampler.IsArray(); - const bool has_shadow = meta->sampler.IsShadow(); + const bool has_array = meta->sampler.is_array; + const bool has_shadow = meta->sampler.is_shadow; + const bool workaround_lod_array_shadow_as_grad = + !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube); + + std::string expr = "texture"; + + if (workaround_lod_array_shadow_as_grad) { + expr += "Grad"; + } else { + expr += function_suffix; + } - std::string expr = "texture" + function_suffix; if (!meta->aoffi.empty()) { expr += "Offset"; } else if (!meta->ptp.empty()) { expr += "Offsets"; } - if (!meta->sampler.IsIndexed()) { + if (!meta->sampler.is_indexed) { expr += '(' + GetSampler(meta->sampler) + ", "; } else { expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], "; @@ -1372,6 +1437,18 @@ private: expr += ')'; } + if (workaround_lod_array_shadow_as_grad) { + switch (meta->sampler.type) { + case TextureType::Texture2D: + return expr + ", vec2(0.0), vec2(0.0))"; + case TextureType::TextureCube: + return expr + ", vec3(0.0), vec3(0.0))"; + default: + UNREACHABLE(); + break; + } + } + for (const auto& variant : extras) { if (const auto argument = std::get_if<TextureArgument>(&variant)) { expr += GenerateTextureArgument(*argument); @@ -1482,8 +1559,8 @@ private: dy += '('; for (std::size_t index = 0; index < components; ++index) { - const auto operand_x{derivates.at(index * 2)}; - const auto operand_y{derivates.at(index * 2 + 1)}; + const auto& operand_x{derivates.at(index * 2)}; + const auto& operand_y{derivates.at(index * 2 + 1)}; dx += Visit(operand_x).AsFloat(); dy += Visit(operand_y).AsFloat(); @@ -1536,7 +1613,9 @@ private: Expression target; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { - // Writing to Register::ZeroIndex is a no op + // Writing to Register::ZeroIndex is a no op but we still have to visit the source + // as it might have side effects. + code.AddLine("{};", Visit(src).GetCode()); return {}; } target = {GetRegister(gpr->GetIndex()), Type::Float}; @@ -1838,38 +1917,48 @@ private: Type::HalfFloat}; } - template <Type type> - Expression LogicalLessThan(Operation operation) { - return GenerateBinaryInfix(operation, "<", Type::Bool, type, type); - } - - template <Type type> - Expression LogicalEqual(Operation operation) { - return GenerateBinaryInfix(operation, "==", Type::Bool, type, type); - } + template <const std::string_view& op, Type type, bool unordered = false> + Expression Comparison(Operation operation) { + static_assert(!unordered || type == Type::Float); - template <Type type> - Expression LogicalLessEqual(Operation operation) { - return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type); - } + Expression expr = GenerateBinaryInfix(operation, op, Type::Bool, type, type); - template <Type type> - Expression LogicalGreaterThan(Operation operation) { - return GenerateBinaryInfix(operation, ">", Type::Bool, type, type); + if constexpr (op.compare("!=") == 0 && type == Type::Float && !unordered) { + // GLSL's operator!=(float, float) doesn't seem be ordered. This happens on both AMD's + // and Nvidia's proprietary stacks. Manually force an ordered comparison. + return {fmt::format("({} && !isnan({}) && !isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; + } + if constexpr (!unordered) { + return expr; + } + // Unordered comparisons are always true for NaN operands. + return {fmt::format("({} || isnan({}) || isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalNotEqual(Operation operation) { - return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type); + Expression FOrdered(Operation operation) { + return {fmt::format("(!isnan({}) && !isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalGreaterEqual(Operation operation) { - return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type); + Expression FUnordered(Operation operation) { + return {fmt::format("(isnan({}) || isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - Expression LogicalFIsNan(Operation operation) { - return GenerateUnary(operation, "isnan", Type::Bool, Type::Float); + Expression LogicalAddCarry(Operation operation) { + const std::string carry = code.GenerateTemporary(); + code.AddLine("uint {};", carry); + code.AddLine("uaddCarry({}, {}, {});", VisitOperand(operation, 0).AsUint(), + VisitOperand(operation, 1).AsUint(), carry); + return {fmt::format("({} != 0)", carry), Type::Bool}; } Expression LogicalAssign(Operation operation) { @@ -1967,24 +2056,39 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.IsShadow()) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture( - operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); - if (meta->sampler.IsShadow()) { + std::string expr{}; + + if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube)) { + LOG_ERROR(Render_OpenGL, + "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); + expr = GenerateTexture(operation, "Lod", {}); + } else { + expr = GenerateTexture(operation, "Lod", + {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + } + + if (meta->sampler.is_shadow) { expr = "vec4(" + expr + ')'; } return {expr + GetSwizzle(meta->element), Type::Float}; @@ -1993,11 +2097,11 @@ private: Expression TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - const auto type = meta.sampler.IsShadow() ? Type::Float : Type::Int; - const bool separate_dc = meta.sampler.IsShadow(); + const auto type = meta.sampler.is_shadow ? Type::Float : Type::Int; + const bool separate_dc = meta.sampler.is_shadow; std::vector<TextureIR> ir; - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { ir = {TextureOffset{}}; } else { ir = {TextureOffset{}, TextureArgument{type, meta.component}}; @@ -2042,7 +2146,7 @@ private: constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - UNIMPLEMENTED_IF(meta->sampler.IsArray()); + UNIMPLEMENTED_IF(meta->sampler.is_array); const std::size_t count = operation.GetOperandsCount(); std::string expr = "texelFetch("; @@ -2063,7 +2167,7 @@ private: } expr += ')'; - if (meta->lod && !meta->sampler.IsBuffer()) { + if (meta->lod && !meta->sampler.is_buffer) { expr += ", "; expr += Visit(meta->lod).AsInt(); } @@ -2074,12 +2178,10 @@ private: } Expression TextureGradient(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); std::string expr = GenerateTexture(operation, "Grad", {TextureDerivates{}, TextureOffset{}}); - return {std::move(expr) + GetSwizzle(meta->element), Type::Float}; + return {std::move(expr) + GetSwizzle(meta.element), Type::Float}; } Expression ImageLoad(Operation operation) { @@ -2295,6 +2397,18 @@ private: return {"gl_SubGroupInvocationARB", Type::Uint}; } + template <const std::string_view& comparison> + Expression ThreadMask(Operation) { + if (device.HasWarpIntrinsics()) { + return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint}; + } + if (device.HasShaderBallot()) { + return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint}; + } + LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader"); + return {"0U", Type::Uint}; + } + Expression ShuffleIndexed(Operation operation) { std::string value = VisitOperand(operation, 0).AsFloat(); @@ -2307,7 +2421,21 @@ private: return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float}; } - Expression MemoryBarrierGL(Operation) { + Expression Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled"); + return {}; + } + code.AddLine("barrier();"); + return {}; + } + + Expression MemoryBarrierGroup(Operation) { + code.AddLine("groupMemoryBarrier();"); + return {}; + } + + Expression MemoryBarrierGlobal(Operation) { code.AddLine("memoryBarrier();"); return {}; } @@ -2316,6 +2444,19 @@ private: Func() = delete; ~Func() = delete; + static constexpr std::string_view LessThan = "<"; + static constexpr std::string_view Equal = "=="; + static constexpr std::string_view LessEqual = "<="; + static constexpr std::string_view GreaterThan = ">"; + static constexpr std::string_view NotEqual = "!="; + static constexpr std::string_view GreaterEqual = ">="; + + static constexpr std::string_view Eq = "Eq"; + static constexpr std::string_view Ge = "Ge"; + static constexpr std::string_view Gt = "Gt"; + static constexpr std::string_view Le = "Le"; + static constexpr std::string_view Lt = "Lt"; + static constexpr std::string_view Add = "Add"; static constexpr std::string_view Min = "Min"; static constexpr std::string_view Max = "Max"; @@ -2417,27 +2558,36 @@ private: &GLSLDecompiler::LogicalPick2, &GLSLDecompiler::LogicalAnd2, - &GLSLDecompiler::LogicalLessThan<Type::Float>, - &GLSLDecompiler::LogicalEqual<Type::Float>, - &GLSLDecompiler::LogicalLessEqual<Type::Float>, - &GLSLDecompiler::LogicalGreaterThan<Type::Float>, - &GLSLDecompiler::LogicalNotEqual<Type::Float>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Float>, - &GLSLDecompiler::LogicalFIsNan, - - &GLSLDecompiler::LogicalLessThan<Type::Int>, - &GLSLDecompiler::LogicalEqual<Type::Int>, - &GLSLDecompiler::LogicalLessEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterThan<Type::Int>, - &GLSLDecompiler::LogicalNotEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Int>, - - &GLSLDecompiler::LogicalLessThan<Type::Uint>, - &GLSLDecompiler::LogicalEqual<Type::Uint>, - &GLSLDecompiler::LogicalLessEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterThan<Type::Uint>, - &GLSLDecompiler::LogicalNotEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, false>, + &GLSLDecompiler::FOrdered, + &GLSLDecompiler::FUnordered, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, true>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Int>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Int>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Uint>, + + &GLSLDecompiler::LogicalAddCarry, &GLSLDecompiler::Logical2HLessThan<false>, &GLSLDecompiler::Logical2HEqual<false>, @@ -2524,9 +2674,16 @@ private: &GLSLDecompiler::VoteEqual, &GLSLDecompiler::ThreadId, + &GLSLDecompiler::ThreadMask<Func::Eq>, + &GLSLDecompiler::ThreadMask<Func::Ge>, + &GLSLDecompiler::ThreadMask<Func::Gt>, + &GLSLDecompiler::ThreadMask<Func::Le>, + &GLSLDecompiler::ThreadMask<Func::Lt>, &GLSLDecompiler::ShuffleIndexed, - &GLSLDecompiler::MemoryBarrierGL, + &GLSLDecompiler::Barrier, + &GLSLDecompiler::MemoryBarrierGroup, + &GLSLDecompiler::MemoryBarrierGlobal, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2596,11 +2753,11 @@ private: } std::string GetSampler(const Sampler& sampler) const { - return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(sampler.index, "sampler"); } std::string GetImage(const Image& image) const { - return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(image.index, "image"); } std::string AppendSuffix(u32 index, std::string_view name) const { @@ -2623,15 +2780,6 @@ private: return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } - bool IsRenderTargetEnabled(u32 render_target) const { - for (u32 component = 0; component < 4; ++component) { - if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { - return true; - } - } - return false; - } - const Device& device; const ShaderIR& ir; const Registry& registry; @@ -2639,6 +2787,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -2834,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2855,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } entries.shader_length = ir.GetLength(); + entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e7dbd810c..451c9689a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -33,36 +33,19 @@ public: } private: - u32 index{}; + u32 index = 0; }; -class GlobalMemoryEntry { -public: - explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) +struct GlobalMemoryEntry { + constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, + bool is_written) : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ is_written} {} - u32 GetCbufIndex() const { - return cbuf_index; - } - - u32 GetCbufOffset() const { - return cbuf_offset; - } - - bool IsRead() const { - return is_read; - } - - bool IsWritten() const { - return is_written; - } - -private: - u32 cbuf_index{}; - u32 cbuf_offset{}; - bool is_read{}; - bool is_written{}; + u32 cbuf_index = 0; + u32 cbuf_offset = 0; + bool is_read = false; + bool is_written = false; }; struct ShaderEntries { @@ -70,11 +53,13 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - u32 clip_distances{}; std::size_t shader_length{}; + u32 clip_distances{}; + bool use_unified_uniforms{}; }; -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); +ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Tegra::Engines::ShaderType stage); std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, const VideoCommon::Shader::Registry& registry, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..70dd0c3c6 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array<u8, 64>; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -63,7 +73,7 @@ ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default; ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default; -bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { +bool ShaderDiskCacheEntry::Load(Common::FS::IOFile& file) { if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { return false; } @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,29 +113,38 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector<ConstBufferKey> flat_keys(num_keys); - std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); + std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); + std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); + } + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; } -bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { +bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const { if (file.WriteObject(static_cast<u32>(type)) != 1 || file.WriteObject(static_cast<u32>(code.size())) != 1 || file.WriteObject(static_cast<u32>(code_b.size())) != 1) { @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast<u32>(keys.size())) != 1 || file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { return false; } @@ -152,48 +174,64 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector<BoundSamplerKey> flat_bound_samplers; + std::vector<BoundSamplerEntry> flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); } - std::vector<BindlessSamplerKey> flat_bindless_samplers; + std::vector<SeparateSamplerEntry> flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); + } + + std::vector<BindlessSamplerEntry> flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default; ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; +void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) { + title_id = title_id_; +} + std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id - const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; - if (!Settings::values.use_disk_shader_cache || !has_title_id) { - return {}; + const bool has_title_id = title_id != 0; + if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) { + return std::nullopt; } - FileUtil::IOFile file(GetTransferablePath(), "rb"); + Common::FS::IOFile file(GetTransferablePath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No transferable shader cache found"); is_usable = true; - return {}; + return std::nullopt; } u32 version{}; if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it"); - return {}; + return std::nullopt; } if (version < NativeVersion) { @@ -201,12 +239,12 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran file.Close(); InvalidateTransferable(); is_usable = true; - return {}; + return std::nullopt; } if (version > NativeVersion) { LOG_WARNING(Render_OpenGL, "Transferable shader cache was generated with a newer version " "of the emulator, skipping"); - return {}; + return std::nullopt; } // Version is valid, load the shaders @@ -215,7 +253,7 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran ShaderDiskCacheEntry& entry = entries.emplace_back(); if (!entry.Load(file)) { LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping"); - return {}; + return std::nullopt; } } @@ -228,7 +266,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() return {}; } - FileUtil::IOFile file(GetPrecompiledPath(), "rb"); + Common::FS::IOFile file(GetPrecompiledPath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No precompiled shader cache found"); return {}; @@ -245,7 +283,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() } std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile( - FileUtil::IOFile& file) { + Common::FS::IOFile& file) { // Read compressed file from disk and decompress to virtual precompiled cache file std::vector<u8> compressed(file.GetSize()); file.ReadBytes(compressed.data(), compressed.size()); @@ -256,12 +294,12 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo ShaderCacheVersionHash file_hash{}; if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } if (GetShaderCacheVersionHash() != file_hash) { LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } std::vector<ShaderDiskCachePrecompiled> entries; @@ -271,19 +309,19 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo if (!LoadObjectFromPrecompiled(entry.unique_identifier) || !LoadObjectFromPrecompiled(entry.binary_format) || !LoadObjectFromPrecompiled(binary_size)) { - return {}; + return std::nullopt; } entry.binary.resize(binary_size); if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) { - return {}; + return std::nullopt; } } return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { - if (!FileUtil::Delete(GetTransferablePath())) { + if (!Common::FS::Delete(GetTransferablePath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}", GetTransferablePath()); } @@ -294,7 +332,7 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { // Clear virtaul precompiled cache file precompiled_cache_virtual_file.Resize(0); - if (!FileUtil::Delete(GetPrecompiledPath())) { + if (!Common::FS::Delete(GetPrecompiledPath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath()); } } @@ -310,7 +348,7 @@ void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { return; } - FileUtil::IOFile file = AppendTransferableFile(); + Common::FS::IOFile file = AppendTransferableFile(); if (!file.IsOpen()) { return; } @@ -352,15 +390,15 @@ void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint progra } } -FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { +Common::FS::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { if (!EnsureDirectories()) { return {}; } const auto transferable_path{GetTransferablePath()}; - const bool existed = FileUtil::Exists(transferable_path); + const bool existed = Common::FS::Exists(transferable_path); - FileUtil::IOFile file(transferable_path, "ab"); + Common::FS::IOFile file(transferable_path, "ab"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open transferable cache in path={}", transferable_path); return {}; @@ -392,7 +430,7 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size()); const auto precompiled_path{GetPrecompiledPath()}; - FileUtil::IOFile file(precompiled_path, "wb"); + Common::FS::IOFile file(precompiled_path, "wb"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); @@ -406,24 +444,24 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { bool ShaderDiskCacheOpenGL::EnsureDirectories() const { const auto CreateDir = [](const std::string& dir) { - if (!FileUtil::CreateDir(dir)) { + if (!Common::FS::CreateDir(dir)) { LOG_ERROR(Render_OpenGL, "Failed to create directory={}", dir); return false; } return true; }; - return CreateDir(FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir)) && + return CreateDir(Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir)) && CreateDir(GetBaseDir()) && CreateDir(GetTransferableDir()) && CreateDir(GetPrecompiledDir()); } std::string ShaderDiskCacheOpenGL::GetTransferablePath() const { - return FileUtil::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetPrecompiledPath() const { - return FileUtil::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetTransferableDir() const { @@ -435,11 +473,11 @@ std::string ShaderDiskCacheOpenGL::GetPrecompiledDir() const { } std::string ShaderDiskCacheOpenGL::GetBaseDir() const { - return FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir) + DIR_SEP "opengl"; + return Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir) + DIR_SEP "opengl"; } std::string ShaderDiskCacheOpenGL::GetTitleID() const { - return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID()); + return fmt::format("{:016X}", title_id); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..aef841c1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -21,11 +21,7 @@ #include "video_core/engines/shader_type.h" #include "video_core/shader/registry.h" -namespace Core { -class System; -} - -namespace FileUtil { +namespace Common::FS { class IOFile; } @@ -38,9 +34,9 @@ struct ShaderDiskCacheEntry { ShaderDiskCacheEntry(); ~ShaderDiskCacheEntry(); - bool Load(FileUtil::IOFile& file); + bool Load(Common::FS::IOFile& file); - bool Save(FileUtil::IOFile& file) const; + bool Save(Common::FS::IOFile& file) const; bool HasProgramA() const { return !code.empty() && !code_b.empty(); @@ -57,6 +53,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; @@ -69,9 +66,12 @@ struct ShaderDiskCachePrecompiled { class ShaderDiskCacheOpenGL { public: - explicit ShaderDiskCacheOpenGL(Core::System& system); + explicit ShaderDiskCacheOpenGL(); ~ShaderDiskCacheOpenGL(); + /// Binds a title ID for all future operations. + void BindTitleID(u64 title_id); + /// Loads transferable cache. If file has a old version or on failure, it deletes the file. std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); @@ -96,10 +96,10 @@ public: private: /// Loads the transferable cache. Returns empty on failure. std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile( - FileUtil::IOFile& file); + Common::FS::IOFile& file); /// Opens current game's transferable file and write it's header if it doesn't exist - FileUtil::IOFile AppendTransferableFile() const; + Common::FS::IOFile AppendTransferableFile() const; /// Save precompiled header to precompiled_cache_in_memory void SavePrecompiledHeaderToVirtualPrecompiledCache(); @@ -156,8 +156,6 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - Core::System& system; - // Stores whole precompiled cache which will be read from or saved to the precompiled chache // file FileSys::VectorVfsFile precompiled_cache_virtual_file; @@ -167,8 +165,11 @@ private: // Stored transferable shaders std::unordered_set<u64> stored_transferable; + /// Title ID to operate on + u64 title_id = 0; + // The cache has been loaded at boot - bool is_usable{}; + bool is_usable = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 9c7b0adbd..691c6c79b 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -6,47 +6,124 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -namespace OpenGL::GLShader { +namespace OpenGL { -ProgramManager::ProgramManager() = default; +namespace { + +void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) { + if (current == old) { + return; + } + if (current == 0) { + if (enabled) { + enabled = false; + glDisable(stage); + } + return; + } + if (!enabled) { + enabled = true; + glEnable(stage); + } + glBindProgramARB(stage, current); +} + +} // Anonymous namespace + +ProgramManager::ProgramManager(const Device& device) + : use_assembly_programs{device.UseAssemblyShaders()} { + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + } else { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); + } +} ProgramManager::~ProgramManager() = default; -void ProgramManager::Create() { - graphics_pipeline.Create(); - glBindProgramPipeline(graphics_pipeline.handle); +void ProgramManager::BindCompute(GLuint program) { + if (use_assembly_programs) { + glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program); + } else { + is_graphics_bound = false; + glUseProgram(program); + } } void ProgramManager::BindGraphicsPipeline() { + if (!use_assembly_programs) { + UpdateSourcePrograms(); + } +} + +void ProgramManager::BindHostPipeline(GLuint pipeline) { + if (use_assembly_programs) { + if (geometry_enabled) { + geometry_enabled = false; + old_state.geometry = 0; + glDisable(GL_GEOMETRY_PROGRAM_NV); + } + } else { + if (!is_graphics_bound) { + glUseProgram(0); + } + } + glBindProgramPipeline(pipeline); +} + +void ProgramManager::RestoreGuestPipeline() { + if (use_assembly_programs) { + glBindProgramPipeline(0); + } else { + glBindProgramPipeline(graphics_pipeline.handle); + } +} + +void ProgramManager::UseVertexShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); + } + current_state.vertex = program; +} + +void ProgramManager::UseGeometryShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.vertex, geometry_enabled); + } + current_state.geometry = program; +} + +void ProgramManager::UseFragmentShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.vertex, fragment_enabled); + } + current_state.fragment = program; +} + +void ProgramManager::UpdateSourcePrograms() { if (!is_graphics_bound) { is_graphics_bound = true; glUseProgram(0); } - // Avoid updating the pipeline when values have no changed - if (old_state == current_state) { - return; - } - - // Workaround for AMD bug - static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; const GLuint handle = graphics_pipeline.handle; - glUseProgramStages(handle, all_used_stages, 0); - glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) { + if (current == old) { + return; + } + glUseProgramStages(handle, stage, current); + }; + update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex); + update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry); + update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment); old_state = current_state; } -void ProgramManager::BindComputeShader(GLuint program) { - is_graphics_bound = false; - glUseProgram(program); -} - void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; @@ -54,4 +131,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f; } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index d2e47f2a9..950e0dfcb 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -11,7 +11,9 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" -namespace OpenGL::GLShader { +namespace OpenGL { + +class Device; /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at @@ -28,50 +30,47 @@ static_assert(sizeof(MaxwellUniformData) < 16384, class ProgramManager { public: - explicit ProgramManager(); + explicit ProgramManager(const Device& device); ~ProgramManager(); - void Create(); + /// Binds a compute program + void BindCompute(GLuint program); - /// Updates the graphics pipeline and binds it. + /// Updates bound programs. void BindGraphicsPipeline(); - /// Binds a compute shader. - void BindComputeShader(GLuint program); - - void UseVertexShader(GLuint program) { - current_state.vertex_shader = program; - } + /// Binds an OpenGL pipeline object unsynchronized with the guest state. + void BindHostPipeline(GLuint pipeline); - void UseGeometryShader(GLuint program) { - current_state.geometry_shader = program; - } + /// Rewinds BindHostPipeline state changes. + void RestoreGuestPipeline(); - void UseFragmentShader(GLuint program) { - current_state.fragment_shader = program; - } + void UseVertexShader(GLuint program); + void UseGeometryShader(GLuint program); + void UseFragmentShader(GLuint program); private: struct PipelineState { - bool operator==(const PipelineState& rhs) const noexcept { - return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && - geometry_shader == rhs.geometry_shader; - } - - bool operator!=(const PipelineState& rhs) const noexcept { - return !operator==(rhs); - } - - GLuint vertex_shader = 0; - GLuint fragment_shader = 0; - GLuint geometry_shader = 0; + GLuint vertex = 0; + GLuint geometry = 0; + GLuint fragment = 0; }; + /// Update GLSL programs. + void UpdateSourcePrograms(); + OGLPipeline graphics_pipeline; - OGLPipeline compute_pipeline; + PipelineState current_state; PipelineState old_state; + + bool use_assembly_programs = false; + bool is_graphics_bound = true; + + bool vertex_enabled = false; + bool geometry_enabled = false; + bool fragment_enabled = false; }; -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 9e74eda0d..4bf0d6090 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <string_view> #include <vector> #include <glad/glad.h> #include "common/assert.h" @@ -11,7 +12,8 @@ namespace OpenGL::GLShader { namespace { -const char* GetStageDebugName(GLenum type) { + +std::string_view StageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: return "vertex"; @@ -25,12 +27,17 @@ const char* GetStageDebugName(GLenum type) { UNIMPLEMENTED(); return "unknown"; } + } // Anonymous namespace -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type = GetStageDebugName(type); +GLuint LoadShader(std::string_view source, GLenum type) { + const std::string_view debug_type = StageDebugName(type); const GLuint shader_id = glCreateShader(type); - glShaderSource(shader_id, 1, &source, nullptr); + + const GLchar* source_string = source.data(); + const GLint source_length = static_cast<GLint>(source.size()); + + glShaderSource(shader_id, 1, &source_string, &source_length); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); glCompileShader(shader_id); diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h index 03b7548c2..1b770532e 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.h +++ b/src/video_core/renderer_opengl/gl_shader_util.h @@ -38,7 +38,7 @@ void LogShaderSource(T... shaders) { * @param source String of the GLSL shader program * @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER) */ -GLuint LoadShader(const char* source, GLenum type); +GLuint LoadShader(std::string_view source, GLenum type); /** * Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader) diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index d24fad3de..6bcf831f2 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -214,10 +214,8 @@ void SetupDirtyMisc(Tables& tables) { } // Anonymous namespace -StateTracker::StateTracker(Core::System& system) : system{system} {} - -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; +StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { + auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyColorMasks(tables); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 0f823288e..9d127548f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -13,8 +13,8 @@ #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" -namespace Core { -class System; +namespace Tegra { +class GPU; } namespace OpenGL { @@ -90,9 +90,7 @@ static_assert(Last <= std::numeric_limits<u8>::max()); class StateTracker { public: - explicit StateTracker(Core::System& system); - - void Initialize(); + explicit StateTracker(Tegra::GPU& gpu); void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { @@ -103,7 +101,6 @@ public: } void NotifyScreenDrawVertexArray() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; @@ -117,98 +114,81 @@ public: } void NotifyPolygonModes() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonModes] = true; flags[OpenGL::Dirty::PolygonModeFront] = true; flags[OpenGL::Dirty::PolygonModeBack] = true; } void NotifyViewport0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Viewports] = true; flags[OpenGL::Dirty::Viewport0] = true; } void NotifyScissor0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Scissors] = true; flags[OpenGL::Dirty::Scissor0] = true; } void NotifyColorMask0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ColorMasks] = true; flags[OpenGL::Dirty::ColorMask0] = true; } void NotifyBlend0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::BlendStates] = true; flags[OpenGL::Dirty::BlendState0] = true; } void NotifyFramebuffer() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[VideoCommon::Dirty::RenderTargets] = true; } void NotifyFrontFace() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FrontFace] = true; } void NotifyCullTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::CullTest] = true; } void NotifyDepthMask() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthMask] = true; } void NotifyDepthTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthTest] = true; } void NotifyStencilTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::StencilTest] = true; } void NotifyPolygonOffset() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonOffset] = true; } void NotifyRasterizeEnable() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::RasterizeEnable] = true; } void NotifyFramebufferSRGB() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FramebufferSRGB] = true; } void NotifyLogicOp() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::LogicOp] = true; } void NotifyClipControl() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ClipControl] = true; } void NotifyAlphaTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::AlphaTest] = true; } private: - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; GLuint index_buffer = 0; }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c53..887995cf4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,11 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <deque> +#include <tuple> #include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, - bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { gl_buffer.Create(); @@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (use_persistent) { - persistent = true; - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast<u8*>(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); - } else { - glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); + static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; + glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + mapped_ptr = static_cast<u8*>( + glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } } OGLStreamBuffer::~OGLStreamBuffer() { - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); gl_buffer.Release(); } -GLuint OGLStreamBuffer::GetHandle() const { - return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { - return buffer_size; -} - std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { ASSERT(size <= buffer_size); ASSERT(alignment <= buffer_size); @@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a bool invalidate = false; if (buffer_pos + size > buffer_size) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); + glInvalidateBufferData(gl_buffer.handle); + buffer_pos = 0; invalidate = true; - - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } } - if (invalidate || !persistent) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | - (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); - mapped_offset = buffer_pos; - } - - return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); + return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); - if (!coherent && size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); - } - - if (!persistent) { - glUnmapNamedBuffer(gl_buffer.handle); + if (size > 0) { + glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); } buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd4..307a67113 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,15 +11,13 @@ namespace OpenGL { +class Device; + class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, - bool use_persistent = true); + explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); - GLuint GetHandle() const; - GLsizeiptr GetSize() const; - /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. @@ -32,15 +30,24 @@ public: void Unmap(GLsizeiptr size); + GLuint Handle() const { + return gl_buffer.handle; + } + + u64 Address() const { + return gpu_address; + } + + GLsizeiptr Size() const noexcept { + return buffer_size; + } + private: OGLBuffer gl_buffer; - bool coherent = false; - bool persistent = false; - + GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; - GLintptr mapped_offset = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2729d1265..a863ef218 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -35,96 +35,109 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", namespace { struct FormatTuple { - GLint internal_format; + GLenum internal_format; GLenum format = GL_NONE; GLenum type = GL_NONE; }; constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // ABGR8U - {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // ABGR8S - {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // ABGR8UI - {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5U - {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10U - {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5U - {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8U - {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8UI - {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // RGBA16F - {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // RGBA16U - {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // RGBA16S - {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // RGBA16UI - {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // R11FG11FB10F - {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // RGBA32UI - {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // DXT1 - {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // DXT23 - {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // DXT45 - {GL_COMPRESSED_RED_RGTC1}, // DXN1 - {GL_COMPRESSED_RG_RGTC2}, // DXN2UNORM - {GL_COMPRESSED_SIGNED_RG_RGTC2}, // DXN2SNORM - {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7U - {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UF16 - {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SF16 - {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4 - {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 - {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // RGBA32F - {GL_RG32F, GL_RG, GL_FLOAT}, // RG32F - {GL_R32F, GL_RED, GL_FLOAT}, // R32F - {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16F - {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16U - {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16S - {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16UI - {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16I - {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // RG16 - {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // RG16F - {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // RG16UI - {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // RG16I - {GL_RG16_SNORM, GL_RG, GL_SHORT}, // RG16S - {GL_RGB32F, GL_RGB, GL_FLOAT}, // RGB32F - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // RGBA8_SRGB - {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8U - {GL_RG8_SNORM, GL_RG, GL_BYTE}, // RG8S - {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // RG32UI - {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // RGBX16F - {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32UI - {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32I - {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8 - {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5 - {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4 - {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM + {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT + {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // A8B8G8R8_UINT + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // R5G6B5_UNORM + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5_UNORM + {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1R5G5B5_UNORM + {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UNORM + {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT + {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5_UNORM + {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8_UNORM + {GL_R8_SNORM, GL_RED, GL_BYTE}, // R8_SNORM + {GL_R8I, GL_RED_INTEGER, GL_BYTE}, // R8_SINT + {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8_UINT + {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16A16_FLOAT + {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // R16G16B16A16_UNORM + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // R16G16B16A16_SNORM + {GL_RGBA16I, GL_RGBA_INTEGER, GL_SHORT}, // R16G16B16A16_SINT + {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // R16G16B16A16_UINT + {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // B10G11R11_FLOAT + {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // R32G32B32A32_UINT + {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // BC1_RGBA_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // BC2_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // BC3_UNORM + {GL_COMPRESSED_RED_RGTC1}, // BC4_UNORM + {GL_COMPRESSED_SIGNED_RED_RGTC1}, // BC4_SNORM + {GL_COMPRESSED_RG_RGTC2}, // BC5_UNORM + {GL_COMPRESSED_SIGNED_RG_RGTC2}, // BC5_SNORM + {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7_UNORM + {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT + {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT + {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM + {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM + {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT + {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT + {GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT + {GL_RG32I, GL_RG_INTEGER, GL_INT}, // R32G32_SINT + {GL_R32F, GL_RED, GL_FLOAT}, // R32_FLOAT + {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16_FLOAT + {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16_UNORM + {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16_SNORM + {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16_UINT + {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16_SINT + {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // R16G16_UNORM + {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // R16G16_FLOAT + {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // R16G16_UINT + {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // R16G16_SINT + {GL_RG16_SNORM, GL_RG, GL_SHORT}, // R16G16_SNORM + {GL_RGB32F, GL_RGB, GL_FLOAT}, // R32G32B32_FLOAT + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_SRGB + {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // R8G8_UNORM + {GL_RG8_SNORM, GL_RG, GL_BYTE}, // R8G8_SNORM + {GL_RG8I, GL_RG_INTEGER, GL_BYTE}, // R8G8_SINT + {GL_RG8UI, GL_RG_INTEGER, GL_UNSIGNED_BYTE}, // R8G8_UINT + {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // R32G32_UINT + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16X16_FLOAT + {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32_UINT + {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32_SINT + {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM + {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM + {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM + {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // DXT1_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // DXT23_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // DXT45_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7U_SRGB - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // R4G4B4A4U + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB - {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5 + {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8 + {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6 + {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10 + {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB - {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12 + {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB - {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6 + {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5 + {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB - {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9F + {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // Z32F - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // Z16 + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // Z24S8 - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8Z24 - {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // Z32FS8 + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM + {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, + GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT }}; const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { @@ -177,10 +190,10 @@ GLint GetSwizzleSource(SwizzleSource source) { GLenum GetComponent(PixelFormat format, bool is_first) { switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; - case PixelFormat::S8Z24: + case PixelFormat::S8_UINT_D24_UNORM: return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: UNREACHABLE(); @@ -237,6 +250,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte return texture; } +constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, + SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params, @@ -256,9 +275,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -379,8 +403,8 @@ void CachedSurface::DecorateSurfaceName() { LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); } -void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) { - LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix); +void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) { + LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix); } View CachedSurface::CreateView(const ViewParams& view_key) { @@ -396,32 +420,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr } CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, - const bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { - target = GetTextureTarget(params.target); - format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format; + bool is_proxy) + : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, + target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { if (!is_proxy) { - texture_view = CreateTextureView(); + main_view = CreateTextureView(); } - swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A); } CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); + } + return; + } + if (params.num_layers > 1) { - // Layered framebuffer attachments UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); - } + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); return; } @@ -429,16 +454,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -446,44 +471,73 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { } } -void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source, +GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, SwizzleSource w_source) { - u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (new_swizzle == swizzle) - return; - swizzle = new_swizzle; - const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; - const GLuint handle = GetTexture(); - const PixelFormat format = surface.GetSurfaceParams().pixel_format; - switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: - case PixelFormat::S8Z24: - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + if (GetSurfaceParams().IsBuffer()) { + return GetTexture(); + } + const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (current_swizzle == new_swizzle) { + return current_view; + } + current_swizzle = new_swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); + OGLTextureView& view = entry->second; + if (!is_cache_miss) { + current_view = view.handle; + return view.handle; + } + view = CreateTextureView(); + current_view = view.handle; + + std::array swizzle{x_source, y_source, z_source, w_source}; + + switch (const PixelFormat format = GetSurfaceParams().pixel_format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + case PixelFormat::S8_UINT_D24_UNORM: + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE, GetComponent(format, x_source == SwizzleSource::R)); - break; - default: - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + + // Make sure we sample the first component + std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) { + return value == SwizzleSource::G ? SwizzleSource::R : value; + }); + [[fallthrough]]; + default: { + const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]), + GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])}; + glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); break; } + } + return view.handle; } OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker) - : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} { +TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker_) + : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{ + state_tracker_} { src_framebuffer.Create(); dst_framebuffer.Create(); } @@ -517,8 +571,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(src_params.depth != 1); + UNIMPLEMENTED_IF(dst_params.depth != 1); state_tracker.NotifyScissor0(); state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 02d9981a1..7787134fc 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,15 +80,17 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; - void ApplySwizzle(Tegra::Texture::SwizzleSource x_source, + GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source); - void DecorateViewName(GPUVAddr gpu_addr, std::string prefix); + void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix); void MarkAsModified(u64 tick) { surface.MarkAsModified(true, tick); @@ -98,7 +100,7 @@ public: if (is_proxy) { return surface.GetTexture(); } - return texture_view.handle; + return main_view.handle; } GLenum GetFormat() const { @@ -110,29 +112,27 @@ public: } private: - u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) const { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - OGLTextureView CreateTextureView() const; CachedSurface& surface; - GLenum target{}; - GLenum format{}; + const GLenum format; + const GLenum target; + const bool is_proxy; + + std::unordered_map<u32, OGLTextureView> view_cache; + OGLTextureView main_view; - OGLTextureView texture_view; - u32 swizzle{}; - bool is_proxy{}; + // Use an invalid default so it always fails the comparison test + u32 current_swizzle = 0xffffffff; + GLuint current_view = 0; }; class TextureCacheOpenGL final : public TextureCacheBase { public: - explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker); + explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 89f0e04ef..a8be2aa37 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -24,10 +24,11 @@ namespace MaxwellToGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -inline GLenum VertexType(Maxwell::VertexAttribute attrib) { +inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { switch (attrib.type) { - case Maxwell::VertexAttribute::Type::UnsignedInt: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::UnsignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -47,11 +48,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - case Maxwell::VertexAttribute::Type::SignedInt: + break; case Maxwell::VertexAttribute::Type::SignedNorm: + case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -71,9 +73,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } + break; case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: @@ -87,45 +89,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::UnsignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_UNSIGNED_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_UNSIGNED_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::SignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - return {}; + break; } + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(), + attrib.SizeString()); + return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -137,8 +107,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format)); - UNREACHABLE(); + UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format)); return {}; } @@ -180,31 +149,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, - Tegra::Texture::TextureMipmapFilter mip_filter_mode) { + Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Nearest: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_LINEAR; + return GL_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_LINEAR; + return GL_NEAREST_MIPMAP_LINEAR; } - } - case Tegra::Texture::TextureFilter::Nearest: { - switch (mip_filter_mode) { + break; + case Tegra::Texture::TextureFilter::Linear: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_NEAREST; + return GL_LINEAR; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_NEAREST; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_LINEAR; } + break; } - } - LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode)); - return GL_LINEAR; + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", + static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode)); + return GL_NEAREST; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -227,10 +197,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; + case Tegra::Texture::WrapMode::MirrorOnceClampOGL: + if (GL_EXT_texture_mirror_clamp) { + return GL_MIRROR_CLAMP_EXT; + } else { + return GL_MIRROR_CLAMP_TO_EDGE; + } } + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -252,8 +227,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast<u32>(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func)); return GL_GREATER; } @@ -275,7 +249,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); return GL_FUNC_ADD; } @@ -339,7 +313,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); return GL_ZERO; } @@ -359,7 +333,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source)); return GL_ZERO; } @@ -390,7 +364,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); return GL_ALWAYS; } @@ -421,7 +395,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil)); return GL_KEEP; } @@ -432,7 +406,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face)); return GL_CCW; } @@ -445,7 +419,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); return GL_BACK; } @@ -484,7 +458,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation)); return GL_COPY; } @@ -501,5 +475,10 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { return GL_FILL; } +inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { + // Enumeration order matches register order. We can convert it arithmetically. + return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle); +} + } // namespace MaxwellToGL } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b2a179746..2ccca1993 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,6 +21,8 @@ #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -30,60 +32,6 @@ namespace OpenGL { namespace { -constexpr std::size_t SWAP_CHAIN_SIZE = 3; - -struct Frame { - u32 width{}; /// Width of the frame (to detect resize) - u32 height{}; /// Height of the frame - bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) - OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO - OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread - OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread - GLsync render_fence{}; /// Fence created on the render thread - GLsync present_fence{}; /// Fence created on the presentation thread - bool is_srgb{}; /// Framebuffer is sRGB or RGB -}; - -constexpr char VERTEX_SHADER[] = R"( -#version 430 core - -out gl_PerVertex { - vec4 gl_Position; -}; - -layout (location = 0) in vec2 vert_position; -layout (location = 1) in vec2 vert_tex_coord; -layout (location = 0) out vec2 frag_tex_coord; - -// This is a truncated 3x3 matrix for 2D transformations: -// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. -// The third column performs translation. -// The third row could be used for projection, which we don't need in 2D. It hence is assumed to -// implicitly be [0, 0, 1] -layout (location = 0) uniform mat3x2 modelview_matrix; - -void main() { - // Multiply input position by the rotscale part of the matrix and then manually translate by - // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector - // to `vec3(vert_position.xy, 1.0)` - gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); - frag_tex_coord = vert_tex_coord; -} -)"; - -constexpr char FRAGMENT_SHADER[] = R"( -#version 430 core - -layout (location = 0) in vec2 frag_tex_coord; -layout (location = 0) out vec4 color; - -layout (binding = 0) uniform sampler2D color_texture; - -void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); -} -)"; - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -96,24 +44,6 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; -/// Returns true if any debug tool is attached -bool HasDebugTool() { - const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - if (nsight) { - return true; - } - - GLint num_extensions; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); - for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { - const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); - if (!std::strcmp(name, "GL_EXT_debug_tool")) { - return true; - } - } - return false; -} - /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -197,132 +127,15 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context) - : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, - has_debug_tool{HasDebugTool()} {} +RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_}, + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} RendererOpenGL::~RendererOpenGL() = default; -MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); -MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); - void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (!framebuffer) { return; @@ -331,79 +144,34 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PrepareRendertarget(framebuffer); RenderScreenshot(); - Frame* frame; - { - MICROPROFILE_SCOPE(OpenGL_WaitPresent); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + DrawScreen(emu_window.GetFramebufferLayout()); - frame = frame_mailbox->GetRenderFrame(); + ++m_current_frame; - // Clean up sync objects before drawing - - // INTEL driver workaround. We can't delete the previous render sync object until we are - // sure that the presentation is done - if (frame->present_fence) { - glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - } - - // delete the draw fence if the frame wasn't presented - if (frame->render_fence) { - glDeleteSync(frame->render_fence); - frame->render_fence = 0; - } - - // wait for the presentation to be done - if (frame->present_fence) { - glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(frame->present_fence); - frame->present_fence = 0; - } - } - - { - MICROPROFILE_SCOPE(OpenGL_RenderFrame); - const auto& layout = render_window.GetFramebufferLayout(); - - // Recreate the frame if the size of the window has changed - if (layout.width != frame->width || layout.height != frame->height || - screen_info.display_srgb != frame->is_srgb) { - LOG_DEBUG(Render_OpenGL, "Reloading render frame"); - frame->is_srgb = screen_info.display_srgb; - frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); - DrawScreen(layout); - // Create a fence for the frontend to wait on and swap this frame to OffTex - frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - frame_mailbox->ReleaseRenderFrame(frame); - m_current_frame++; - rasterizer->TickFrame(); - } + rasterizer->TickFrame(); render_window.PollEvents(); - if (has_debug_tool) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); - Present(0); - context.SwapBuffers(); - } + context->SwapBuffers(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { - if (framebuffer) { - // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || - screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format || - gl_framebuffer_data.empty()) { - // Reallocate texture if the framebuffer size has changed. - // This is expected to not happen very often and hence should not be a - // performance problem. - ConfigureFramebufferTexture(screen_info.texture, *framebuffer); - } - - // Load the framebuffer from memory, draw it to the screen, and swap buffers - LoadFBToScreenInfo(*framebuffer); + if (!framebuffer) { + return; + } + // If framebuffer is provided, reload it from memory to a texture + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { + // Reallocate texture if the framebuffer size has changed. + // This is expected to not happen very often and hence should not be a + // performance problem. + ConfigureFramebufferTexture(screen_info.texture, *framebuffer); } + + // Load the framebuffer from memory, draw it to the screen, and swap buffers + LoadFBToScreenInfo(*framebuffer); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -423,7 +191,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)}; + u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)}; rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); // TODO(Rodrigo): Read this from HLE @@ -453,23 +221,22 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color } void RendererOpenGL::InitOpenGLObjects() { - frame_mailbox = std::make_unique<FrameMailbox>(); - - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); // Create shader programs OGLShader vertex_shader; - vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + vertex_shader.Create(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); OGLShader fragment_shader; - fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + fragment_shader.Create(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); vertex_program.Create(true, false, vertex_shader.handle); fragment_program.Create(true, false, fragment_shader.handle); - // Create program pipeline - program_manager.Create(); + pipeline.Create(); + glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); + glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle); // Generate VBO handle for drawing vertex_buffer.Create(); @@ -487,6 +254,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + // Enable unified vertex attributes and query vertex buffer address when the driver supports it + if (device.HasVertexBufferUnifiedMemory()) { + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + + glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, + &vertex_buffer_address); + } } void RendererOpenGL::AddTelemetryFields() { @@ -498,18 +274,18 @@ void RendererOpenGL::AddTelemetryFields() { LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor); LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model); - auto& telemetry_session = system.TelemetrySession(); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version); + constexpr auto user_system = Common::Telemetry::FieldType::UserSystem; + telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor); + telemetry_session.AddField(user_system, "GPU_Model", gpu_model); + telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version); } void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, - program_manager, state_tracker); + rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device, + screen_info, program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -525,12 +301,12 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, GLint internal_format; switch (framebuffer.pixel_format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: + case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM: internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; break; - case Tegra::FramebufferConfig::PixelFormat::RGB565: + case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: internal_format = GL_RGB565; texture.gl_format = GL_RGB; texture.gl_type = GL_UNSIGNED_SHORT_5_6_5; @@ -551,8 +327,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { if (renderer_settings.set_background_color) { // Update background color before drawing - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); } // Set projection matrix @@ -620,10 +396,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { state_tracker.NotifyClipControl(); state_tracker.NotifyAlphaTest(); - program_manager.UseVertexShader(vertex_program.handle); - program_manager.UseGeometryShader(0); - program_manager.UseFragmentShader(fragment_program.handle); - program_manager.BindGraphicsPipeline(); + program_manager.BindHostPipeline(pipeline.handle); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { @@ -658,58 +431,21 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { offsetof(ScreenRectVertex, tex_coord)); glVertexAttribBinding(PositionLocation, 0); glVertexAttribBinding(TexCoordLocation, 0); - glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + if (device.HasVertexBufferUnifiedMemory()) { + glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, + sizeof(vertices)); + } else { + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + } glBindTextureUnit(0, screen_info.display_texture); glBindSampler(0, 0); glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); -} -bool RendererOpenGL::TryPresent(int timeout_ms) { - if (has_debug_tool) { - LOG_DEBUG(Render_OpenGL, - "Skipping presentation because we are presenting on the main context"); - return false; - } - return Present(timeout_ms); -} - -bool RendererOpenGL::Present(int timeout_ms) { - const auto& layout = render_window.GetFramebufferLayout(); - auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); - if (!frame) { - LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); - return false; - } - - // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a - // readback since we won't be doing any blending - glClear(GL_COLOR_BUFFER_BIT); - - // Recreate the presentation FBO if the color attachment was changed - if (frame->color_reloaded) { - LOG_DEBUG(Render_OpenGL, "Reloading present frame"); - frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height); - } - glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); - // INTEL workaround. - // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete - // it on the emulation thread without too much penalty - // glDeleteSync(frame.render_sync); - // frame.render_sync = 0; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); - glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, - GL_COLOR_BUFFER_BIT, GL_LINEAR); - - // Insert fence for the main thread to block on - frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - return true; + program_manager.RestoreGuestPipeline(); } void RendererOpenGL::RenderScreenshot() { @@ -726,7 +462,7 @@ void RendererOpenGL::RenderScreenshot() { screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); @@ -751,8 +487,9 @@ void RendererOpenGL::RenderScreenshot() { } bool RendererOpenGL::Init() { - if (GLAD_GL_KHR_debug) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); glDebugMessageCallback(DebugHandler, nullptr); } diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 50b647661..9ef181f95 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -9,22 +9,32 @@ #include "common/common_types.h" #include "common/math_util.h" #include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; -} +class TelemetrySession; +} // namespace Core namespace Core::Frontend { class EmuWindow; } +namespace Core::Memory { +class Memory; +} + namespace Layout { struct FramebufferLayout; } +namespace Tegra { +class GPU; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -45,24 +55,17 @@ struct ScreenInfo { TextureInfo texture; }; -struct PresentationTexture { - u32 width = 0; - u32 height = 0; - OGLTexture texture; -}; - -class FrameMailbox; - class RendererOpenGL final : public VideoCore::RendererBase { public: - explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context); + explicit RendererOpenGL(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, + Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererOpenGL() override; bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; private: /// Initializes the OpenGL state and creates persistent objects. @@ -90,37 +93,36 @@ private: void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); - bool Present(int timeout_ms); - + Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; - Core::System& system; - Core::Frontend::GraphicsContext& context; + Core::Memory::Memory& cpu_memory; + Tegra::GPU& gpu; - StateTracker state_tracker{system}; + const Device device; + StateTracker state_tracker{gpu}; // OpenGL object IDs OGLBuffer vertex_buffer; OGLProgram vertex_program; OGLProgram fragment_program; + OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; + // GPU address of the vertex buffer + GLuint64EXT vertex_buffer_address = 0; + /// Display information for Switch screen ScreenInfo screen_info; /// Global dummy shader pipeline - GLShader::ProgramManager program_manager; + ProgramManager program_manager; /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; /// Used for transforming the framebuffer orientation - Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags; + Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{}; Common::Rectangle<int> framebuffer_crop_rect; - - /// Frame presentation mailbox - std::unique_ptr<FrameMailbox> frame_mailbox; - - bool has_debug_tool = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index b751086fa..6d7bb16b2 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -14,68 +14,6 @@ namespace OpenGL { -struct VertexArrayPushBuffer::Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; -}; - -VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker) - : state_tracker{state_tracker} {} - -VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; - -void VertexArrayPushBuffer::Setup() { - index_buffer = nullptr; - vertex_buffers.clear(); -} - -void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { - index_buffer = buffer; -} - -void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, - GLintptr offset, GLsizei stride) { - vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); -} - -void VertexArrayPushBuffer::Bind() { - if (index_buffer) { - state_tracker.BindIndexBuffer(*index_buffer); - } - - for (const auto& entry : vertex_buffers) { - glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride); - } -} - -struct BindBuffersRangePushBuffer::Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; -}; - -BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} - -BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; - -void BindBuffersRangePushBuffer::Setup() { - entries.clear(); -} - -void BindBuffersRangePushBuffer::Push(GLuint binding, const GLuint* buffer, GLintptr offset, - GLsizeiptr size) { - entries.push_back(Entry{binding, buffer, offset, size}); -} - -void BindBuffersRangePushBuffer::Bind() { - for (const Entry& entry : entries) { - glBindBufferRange(target, entry.binding, *entry.buffer, entry.offset, entry.size); - } -} - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) { if (!GLAD_GL_KHR_debug) { // We don't need to throw an error as this is just for debugging diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 47ee3177b..9c09ee12c 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,49 +11,6 @@ namespace OpenGL { -class StateTracker; - -class VertexArrayPushBuffer final { -public: - explicit VertexArrayPushBuffer(StateTracker& state_tracker); - ~VertexArrayPushBuffer(); - - void Setup(); - - void SetIndexBuffer(const GLuint* buffer); - - void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset, - GLsizei stride); - - void Bind(); - -private: - struct Entry; - - StateTracker& state_tracker; - - const GLuint* index_buffer{}; - std::vector<Entry> vertex_buffers; -}; - -class BindBuffersRangePushBuffer final { -public: - explicit BindBuffersRangePushBuffer(GLenum target); - ~BindBuffersRangePushBuffer(); - - void Setup(); - - void Push(GLuint binding, const GLuint* buffer, GLintptr offset, GLsizeiptr size); - - void Bind(); - -private: - struct Entry; - - GLenum target; - std::vector<Entry> entries; -}; - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {}); } // namespace OpenGL |
