diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 50 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 12 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 107 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 4 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 12 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 99 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.h | 6 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.cpp | 3 |
8 files changed, 196 insertions, 97 deletions
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 466a911db..d20547c04 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -6,6 +6,7 @@ #include <array> #include <cstddef> #include <cstring> +#include <limits> #include <optional> #include <vector> @@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, - GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, - GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; +constexpr std::array LimitUBOs = { + GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, + GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; constexpr std::array LimitSSBOs = { - GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, + GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; -constexpr std::array LimitSamplers = { - GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS}; +constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; -constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, - GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, - GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, - GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; +constexpr std::array LimitImages = { + GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, + GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; template <typename T> T GetInteger(GLenum pname) { @@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { return std::exchange(base, base + amount); } +std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { + std::array<u32, Tegra::Engines::MaxShaderTypes> max; + std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); + return max; +} + std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; @@ -159,15 +170,14 @@ bool IsASTCSupported() { } // Anonymous namespace -Device::Device() : base_bindings{BuildBaseBindings()} { +Device::Device() + : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; - const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); @@ -182,7 +192,6 @@ Device::Device() : base_bindings{BuildBaseBindings()} { has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_broken_compute = is_intel_proprietary; has_fast_buffer_sub_data = is_nvidia; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5; @@ -197,7 +206,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} { } Device::Device(std::nullptr_t) { - uniform_buffer_alignment = 0; + max_uniform_buffers.fill(std::numeric_limits<u32>::max()); + uniform_buffer_alignment = 4; + shader_storage_alignment = 4; max_vertex_attributes = 16; max_varyings = 15; has_warp_intrinsics = true; @@ -205,9 +216,6 @@ Device::Device(std::nullptr_t) { has_vertex_viewport_layer = true; has_image_load_formatted = true; has_variable_aoffi = true; - has_component_indexing_bug = false; - has_broken_compute = false; - has_precise_bug = false; } bool Device::TestVariableAoffi() { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index e915dbd86..98cca0254 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -24,6 +24,10 @@ public: explicit Device(); explicit Device(std::nullptr_t); + u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { + return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; + } + const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { return base_bindings[stage_index]; } @@ -80,10 +84,6 @@ public: return has_precise_bug; } - bool HasBrokenCompute() const { - return has_broken_compute; - } - bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } @@ -96,7 +96,8 @@ private: static bool TestVariableAoffi(); static bool TestPreciseBug(); - std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; + std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; + std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; std::size_t uniform_buffer_alignment{}; std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; @@ -109,7 +110,6 @@ private: bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; - bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; bool use_assembly_shaders{}; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 716d43e65..55e79aaf6 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { +constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = + NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; +constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = + NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; + constexpr std::size_t NumSupportedVertexAttributes = 16; template <typename Engine, typename Entry> @@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { CheckExtensions(); + unified_uniform_buffer.Create(); + glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); + if (device.UseAssemblyShaders()) { glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); for (const GLuint cbuf : staging_cbufs) { @@ -655,10 +664,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (device.HasBrokenCompute()) { - return; - } - buffer_cache.Acquire(); current_cbuf = 0; @@ -846,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad MICROPROFILE_SCOPE(OpenGL_UBO); const auto& stages = system.GPU().Maxwell3D().state.shader_stages; const auto& shader_stage = stages[stage_index]; + const auto& entries = shader->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; + const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; - u32 binding = - device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetEntries().const_buffers) { - const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry); + const auto base_bindings = device.GetBaseBindings(stage_index); + u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; + for (const auto& entry : entries.const_buffers) { + const u32 index = entry.GetIndex(); + const auto& buffer = shader_stage.const_buffers[index]; + SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, + base_unified_offset + index * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + + entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, + base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& entries = kernel->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; u32 binding = 0; - for (const auto& entry : kernel->GetEntries().const_buffers) { + for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, + use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, + NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset) { if (!buffer.enabled) { // Set values to zero to unbind buffers if (device.UseAssemblyShaders()) { @@ -889,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, // UBO alignment requirements. const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const auto alignment = device.GetUniformBufferAlignment(); - auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - if (!device.UseAssemblyShaders()) { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + + const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); + const GPUVAddr gpu_addr = buffer.address; + auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + + if (device.UseAssemblyShaders()) { + UNIMPLEMENTED_IF(use_unified); + if (offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); + cbuf = staging_cbuf; + offset = 0; + } + glBindBufferRangeNV(stage, binding, cbuf, offset, size); return; } - if (offset != 0) { - const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); - cbuf = staging_cbuf; - offset = 0; + + if (use_unified) { + glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); } - glBindBufferRangeNV(stage, binding, cbuf, offset, size); } void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { @@ -1024,6 +1060,26 @@ void RasterizerOpenGL::SyncViewport() { const auto& regs = gpu.regs; const bool dirty_viewport = flags[Dirty::Viewports]; + const bool dirty_clip_control = flags[Dirty::ClipControl]; + + if (dirty_clip_control || flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + + GLenum mode = MaxwellToGL::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0 && + regs.viewport_transform[0].scale_y < 0.0f) { + switch (mode) { + case GL_CW: + mode = GL_CCW; + break; + case GL_CCW: + mode = GL_CW; + break; + } + } + glFrontFace(mode); + } + if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; @@ -1121,11 +1177,6 @@ void RasterizerOpenGL::SyncCullMode() { glDisable(GL_CULL_FACE); } } - - if (flags[Dirty::FrontFace]) { - flags[Dirty::FrontFace] = false; - glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); - } } void RasterizerOpenGL::SyncPrimitiveRestart() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 87f7fe159..f5dc56a0e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -107,7 +107,8 @@ private: /// Configures a constant buffer. void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry); + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); @@ -253,6 +254,7 @@ private: Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; std::size_t current_cbuf = 0; + OGLBuffer unified_uniform_buffer; /// Number of commands queued to the OpenGL driver. Reseted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 4cd0f36cf..a991ca64a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + return std::shared_ptr<CachedShader>( + new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), + MakeEntries(params.device, ir, shader_type), std::move(program))); } Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { @@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + return std::shared_ptr<CachedShader>( + new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); } Shader CachedShader::CreateFromCache(const ShaderParameters& params, @@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, PrecompiledShader shader; shader.program = std::move(program); shader.registry = std::move(registry); - shader.entries = MakeEntries(ir); + shader.entries = MakeEntries(device, ir, entry.type); std::scoped_lock lock{mutex}; if (callback) { diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 9cb115959..502b95973 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -61,8 +61,8 @@ struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint @@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { + const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); + // We waste one UBO for emulation + const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; + return num_ubos > num_available_ubos; +} + struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -412,8 +419,9 @@ class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, + suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device, ir, stage)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -834,12 +842,24 @@ private: } void DeclareConstantBuffers() { + if (use_unified_uniforms) { + const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + + static_cast<u32>(ir.GetGlobalMemory().size()); + code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", + binding); + code.AddLine(" uint cbufs[];"); + code.AddLine("}};"); + code.AddNewLine(); + return; + } + u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto& buffers : ir.GetConstantBuffers()) { - const auto index = buffers.first; + for (const auto [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); - code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); code.AddLine("}};"); code.AddNewLine(); } @@ -1038,42 +1058,51 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); + const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; + if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; + if (use_unified_uniforms) { + return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), + Type::Uint}; + } else { + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; + } } - if (std::holds_alternative<OperationNode>(*offset)) { - // Indirect access - const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); + // Indirect access + if (use_unified_uniforms) { + return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, + Visit(offset).AsUint()), + Type::Uint}; + } - if (!device.HasComponentIndexingBug()) { - return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset), - Type::Uint}; - } + const std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); - // AMD's proprietary GLSL compiler emits ill code for variable component access. - // To bypass this driver bug generate 4 ifs, one per each component. - const std::string pack = code.GenerateTemporary(); - code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), - final_offset); - - const std::string result = code.GenerateTemporary(); - code.AddLine("uint {};", result); - for (u32 swizzle = 0; swizzle < 4; ++swizzle) { - code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, - pack, GetSwizzle(swizzle)); - } - return {result, Type::Uint}; + if (!device.HasComponentIndexingBug()) { + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } - UNREACHABLE_MSG("Unmanaged offset node type"); + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("uint {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, + GetSwizzle(swizzle)); + } + return {result, Type::Uint}; } if (const auto gmem = std::get_if<GmemNode>(&*node)) { @@ -2710,6 +2739,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -2905,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2926,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } entries.shader_length = ir.GetLength(); + entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e8a178764..451c9689a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -53,11 +53,13 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - u32 clip_distances{}; std::size_t shader_length{}; + u32 clip_distances{}; + bool use_unified_uniforms{}; }; -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); +ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Tegra::Engines::ShaderType stage); std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, const VideoCommon::Shader::Registry& registry, diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 6b489e6db..e7952924a 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -753,6 +753,9 @@ void RendererOpenGL::RenderScreenshot() { bool RendererOpenGL::Init() { if (GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); + if (Settings::values.renderer_debug) { + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); + } glDebugMessageCallback(DebugHandler, nullptr); } |
