diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
24 files changed, 956 insertions, 636 deletions
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 2b9bd142e..2a9b523f5 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,103 +2,57 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cstring> #include <memory> -#include "common/alignment.h" -#include "core/core.h" -#include "video_core/memory_manager.h" +#include <glad/glad.h> + +#include "common/assert.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { -CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset}, - alignment{alignment} {} - -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) - : RasterizerCache{rasterizer}, stream_buffer(size, true) {} - -GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, - bool cache) { - std::lock_guard lock{mutex}; - auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. 
- cache &= size >= 2048; - - const auto& host_ptr{memory_manager.GetPointer(gpu_addr)}; - if (cache) { - auto entry = TryGet(host_ptr); - if (entry) { - if (entry->GetSize() >= size && entry->GetAlignment() == alignment) { - return entry->GetOffset(); - } - Unregister(entry); - } - } +OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size) + : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{ + rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} - AlignBuffer(alignment); - const GLintptr uploaded_offset = buffer_offset; +OGLBufferCache::~OGLBufferCache() = default; - if (!host_ptr) { - return uploaded_offset; - } - - std::memcpy(buffer_ptr, host_ptr, size); - buffer_ptr += size; - buffer_offset += size; - - if (cache) { - auto entry = std::make_shared<CachedBufferEntry>( - *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr); - Register(entry); - } - - return uploaded_offset; +OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) { + OGLBuffer buffer; + buffer.Create(); + glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + return buffer; } -GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment) { - std::lock_guard lock{mutex}; - AlignBuffer(alignment); - std::memcpy(buffer_ptr, raw_pointer, size); - const GLintptr uploaded_offset = buffer_offset; - - buffer_ptr += size; - buffer_offset += size; - return uploaded_offset; +const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) { + return &buffer.handle; } -bool OGLBufferCache::Map(std::size_t max_size) { - bool invalidate; - std::tie(buffer_ptr, buffer_offset_base, invalidate) = - stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4); - buffer_offset = buffer_offset_base; - - if (invalidate) { - InvalidateAll(); - } - return invalidate; +const GLuint* 
OGLBufferCache::GetEmptyBuffer(std::size_t) { + static const GLuint null_buffer = 0; + return &null_buffer; } -void OGLBufferCache::Unmap() { - stream_buffer.Unmap(buffer_offset - buffer_offset_base); +void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + const u8* data) { + glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -GLuint OGLBufferCache::GetHandle() const { - return stream_buffer.GetHandle(); +void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, + std::size_t size, u8* data) { + glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -void OGLBufferCache::AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const GLintptr offset_aligned = - static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment)); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; +void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, + std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { + glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index f2347581b..8c8ac4038 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -4,80 +4,44 @@ #pragma once -#include <cstddef> #include <memory> -#include <tuple> #include "common/common_types.h" +#include "video_core/buffer_cache.h" #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" +namespace Core { +class System; 
+} + namespace OpenGL { +class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferEntry final : public RasterizerCacheObject { -public: - explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return size; - } - - std::size_t GetSize() const { - return size; - } - - GLintptr GetOffset() const { - return offset; - } - - std::size_t GetAlignment() const { - return alignment; - } - -private: - VAddr cpu_addr{}; - std::size_t size{}; - GLintptr offset{}; - std::size_t alignment{}; -}; - -class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { +class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size); - - /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been - /// allocated. - GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool cache = true); + explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size); + ~OGLBufferCache(); - /// Uploads from a host memory. Returns host's buffer offset where it's been allocated. - GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4); - - bool Map(std::size_t max_size); - void Unmap(); - - GLuint GetHandle() const; + const GLuint* GetEmptyBuffer(std::size_t) override; protected: - void AlignBuffer(std::size_t alignment); + OGLBuffer CreateBuffer(std::size_t size) override; + + const GLuint* ToHandle(const OGLBuffer& buffer) override; - // We do not have to flush this cache as things in it are never modified by us. 
- void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {} + void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + const u8* data) override; -private: - OGLStreamBuffer stream_buffer; + void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + u8* data) override; - u8* buffer_ptr = nullptr; - GLintptr buffer_offset = 0; - GLintptr buffer_offset_base = 0; + void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset, + std::size_t dst_offset, std::size_t size) override; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index a48e14d2e..85424a4c9 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -24,8 +24,10 @@ T GetInteger(GLenum pname) { Device::Device() { uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = TestComponentIndexingBug(); } @@ -34,6 +36,7 @@ Device::Device(std::nullptr_t) { uniform_buffer_alignment = 0; max_vertex_attributes = 16; max_varyings = 15; + has_vertex_viewport_layer = true; has_variable_aoffi = true; has_component_indexing_bug = false; } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 8c8c93760..dc883722d 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -18,6 +18,10 @@ public: return uniform_buffer_alignment; } + std::size_t GetShaderStorageBufferAlignment() const { + return 
shader_storage_alignment; + } + u32 GetMaxVertexAttributes() const { return max_vertex_attributes; } @@ -26,6 +30,10 @@ public: return max_varyings; } + bool HasVertexViewportLayer() const { + return has_vertex_viewport_layer; + } + bool HasVariableAoffi() const { return has_variable_aoffi; } @@ -39,8 +47,10 @@ private: static bool TestComponentIndexingBug(); std::size_t uniform_buffer_alignment{}; + std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + bool has_vertex_viewport_layer{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; }; diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp deleted file mode 100644 index d5e385151..000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <glad/glad.h> - -#include "common/logging/log.h" -#include "core/core.h" -#include "video_core/memory_manager.h" -#include "video_core/renderer_opengl/gl_global_cache.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_opengl/utils.h" - -namespace OpenGL { - -CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, - max_size{max_size} { - buffer.Create(); - LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); -} - -CachedGlobalRegion::~CachedGlobalRegion() = default; - -void CachedGlobalRegion::Reload(u32 size_) { - size = size_; - if (size > max_size) { - size = max_size; - LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, - max_size); - } - glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); -} - -void 
CachedGlobalRegion::Flush() { - LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr); - glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); -} - -GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { - const auto search{reserve.find(addr)}; - if (search == reserve.end()) { - return {}; - } - return search->second; -} - -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, - u32 size) { - GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; - if (!region) { - // No reserved surface available, create a new one and reserve it - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; - ASSERT(cpu_addr); - - region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); - ReserveGlobalRegion(region); - } - region->Reload(size); - return region; -} - -void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { - reserve.insert_or_assign(region->GetCacheAddr(), std::move(region)); -} - -GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer} { - GLint max_ssbo_size_; - glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); - max_ssbo_size = static_cast<u32>(max_ssbo_size_); -} - -GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( - const GLShader::GlobalMemoryEntry& global_region, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) { - std::lock_guard lock{mutex}; - - auto& gpu{Core::System::GetInstance().GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; - const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + - global_region.GetCbufOffset()}; - const auto 
actual_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - - // Look up global region in the cache based on address - const auto& host_ptr{memory_manager.GetPointer(actual_addr)}; - GlobalRegion region{TryGet(host_ptr)}; - - if (!region) { - // No global region found - create a new one - region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); - Register(region); - } - - return region; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h deleted file mode 100644 index 2d467a240..000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <unordered_map> - -#include <glad/glad.h> - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" - -namespace OpenGL { - -namespace GLShader { -class GlobalMemoryEntry; -} - -class RasterizerOpenGL; -class CachedGlobalRegion; -using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; - -class CachedGlobalRegion final : public RasterizerCacheObject { -public: - explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); - ~CachedGlobalRegion(); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return size; - } - - /// Gets the GL program handle for the buffer - GLuint GetBufferHandle() const { - return buffer.handle; - } - - /// Reloads the global region from guest memory - void Reload(u32 size_); - - void Flush(); - -private: - VAddr cpu_addr{}; - u8* host_ptr{}; - u32 size{}; - u32 max_size{}; - - OGLBuffer buffer; -}; - -class 
GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> { -public: - explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer); - - /// Gets the current specified shader stage program - GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage); - -protected: - void FlushObjectInner(const GlobalRegion& object) override { - object->Flush(); - } - -private: - GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; - GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); - void ReserveGlobalRegion(GlobalRegion region); - - std::unordered_map<CacheAddr, GlobalRegion> reserve; - u32 max_ssbo_size{}; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f45a3c5ef..c28ae795c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -4,6 +4,7 @@ #include <algorithm> #include <array> +#include <bitset> #include <memory> #include <string> #include <string_view> @@ -19,7 +20,9 @@ #include "core/core.h" #include "core/hle/kernel/process.h" #include "core/settings.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -80,16 +83,31 @@ struct DrawParameters { } }; +static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, + const GLShader::ConstBufferEntry& entry) { + if (!entry.IsIndirect()) { + return entry.GetSize(); + } + + if (buffer.size > Maxwell::MaxConstBufferSize) { + LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, + Maxwell::MaxConstBufferSize); + return 
Maxwell::MaxConstBufferSize; + } + + return buffer.size; +} + RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, - global_cache{*this}, system{system}, screen_info{info}, - buffer_cache(*this, STREAM_BUFFER_SIZE) { + system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { OpenGLState::ApplyDefaultState(); shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); + clear_framebuffer.Create(); LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); @@ -109,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; - if (!gpu.dirty_flags.vertex_attrib_format) { + if (!gpu.dirty.vertex_attrib_format) { return state.draw.vertex_array; } - gpu.dirty_flags.vertex_attrib_format = false; + gpu.dirty.vertex_attrib_format = false; MICROPROFILE_SCOPE(OpenGL_VAO); @@ -129,8 +147,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { state.draw.vertex_array = vao; state.ApplyVertexArrayState(); - glVertexArrayElementBuffer(vao, buffer_cache.GetHandle()); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. // Enables the first 16 vertex attributes always, as we don't know which ones are actually // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 @@ -168,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { } // Rebinding the VAO invalidates the vertex buffer bindings. 
- gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); state.draw.vertex_array = vao_entry.handle; return vao_entry.handle; @@ -176,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (gpu.dirty_flags.vertex_array.none()) + if (!gpu.dirty.vertex_array_buffers) return; + gpu.dirty.vertex_array_buffers = false; + + const auto& regs = gpu.regs; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty_flags.vertex_array[index]) + if (!gpu.dirty.vertex_array[index]) continue; + gpu.dirty.vertex_array[index] = false; + gpu.dirty.vertex_instance[index] = false; const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) @@ -197,11 +216,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { ASSERT(end > start); const u64 size = end - start + 1; - const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size); + const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the current offset. - glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset, - vertex_array.stride); + vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset, + vertex_array.stride); if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { // Enable vertex buffer instancing with the specified divisor. 
@@ -211,11 +230,47 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { glVertexArrayBindingDivisor(vao, index, 0); } } +} + +void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { + auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.dirty.vertex_instances) + return; + gpu.dirty.vertex_instances = false; + + const auto& regs = gpu.regs; + // Upload all guest vertex arrays sequentially to our buffer + for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!gpu.dirty.vertex_instance[index]) + continue; + + gpu.dirty.vertex_instance[index] = false; + + if (regs.instanced_arrays.IsInstancingEnabled(index) && + regs.vertex_array[index].divisor != 0) { + // Enable vertex buffer instancing with the specified divisor. + glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); + } else { + // Disable the vertex buffer instancing. + glVertexArrayBindingDivisor(vao, index, 0); + } + } +} - gpu.dirty_flags.vertex_array.reset(); +GLintptr RasterizerOpenGL::SetupIndexBuffer() { + if (accelerate_draw != AccelDraw::Indexed) { + return 0; + } + MICROPROFILE_SCOPE(OpenGL_Index); + const auto& regs = system.GPU().Maxwell3D().regs; + const std::size_t size = CalculateIndexBufferSize(); + const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + vertex_array_pushbuffer.SetIndexBuffer(buffer); + return offset; } -DrawParameters RasterizerOpenGL::SetupDraw() { +DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) { const auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -227,11 +282,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() { params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); if (is_indexed) { - MICROPROFILE_SCOPE(OpenGL_Index); params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); params.count = regs.index_array.count; - params.index_buffer_offset = - 
buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize()); + params.index_buffer_offset = index_buffer_offset; params.base_vertex = static_cast<GLint>(regs.vb_element_base); } else { params.count = regs.vertex_buffer.count; @@ -247,10 +300,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { BaseBindings base_bindings; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; - // Prepare packed bindings - bind_ubo_pushbuffer.Setup(base_bindings.cbuf); - bind_ssbo_pushbuffer.Setup(base_bindings.gmem); - for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -271,18 +320,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = + const auto [buffer, offset] = buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo))); Shader shader{shader_cache.GetStageProgram(program)}; - const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)}; + const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); SetupDrawConstBuffers(stage_enum, shader); - SetupGlobalRegions(stage_enum, shader); + SetupDrawGlobalMemory(stage_enum, shader); const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)}; const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage}; @@ -321,12 +369,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { base_bindings = next_bindings; } - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - SyncClipEnabled(clip_distances); - gpu.dirty_flags.shaders 
= false; + gpu.dirty.shaders = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -409,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents, single_color_target}; - if (fb_config_state == current_framebuffer_config_state && - gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) { + if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) { // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or // single color targets). This is done because the guest registers may not change but the // host framebuffer may contain different attachments return current_depth_stencil_usage; } + gpu.dirty.render_settings = false; current_framebuffer_config_state = fb_config_state; texture_cache.GuardRenderTargets(true); @@ -504,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable}; } +void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb) { + auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + texture_cache.GuardRenderTargets(true); + View color_surface{}; + if (using_color_fb) { + color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false); + } + View depth_surface{}; + if (using_depth_fb || using_stencil_fb) { + depth_surface = texture_cache.GetDepthBufferSurface(false); + } + texture_cache.GuardRenderTargets(false); + + current_state.draw.draw_framebuffer = clear_framebuffer.handle; + current_state.ApplyFramebufferState(); + + if (color_surface) { + color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + 
} + + if (depth_surface) { + const auto& params = depth_surface->GetSurfaceParams(); + switch (params.type) { + case VideoCore::Surface::SurfaceType::Depth: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + break; + } + case VideoCore::Surface::SurfaceType::DepthStencil: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + break; + } + default: { UNIMPLEMENTED(); } + } + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } +} + void RasterizerOpenGL::Clear() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& maxwell3d = system.GPU().Maxwell3D(); + + if (!maxwell3d.ShouldExecute()) { + return; + } + + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState clear_state; + OpenGLState prev_state{OpenGLState::GetCurState()}; + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); + + OpenGLState clear_state{OpenGLState::GetCurState()}; + clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; @@ -530,6 +633,7 @@ void RasterizerOpenGL::Clear() { // true. 
clear_state.depth.test_enabled = true; clear_state.depth.test_func = GL_ALWAYS; + clear_state.depth.write_mask = GL_TRUE; } if (regs.clear_buffers.S) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); @@ -566,8 +670,9 @@ void RasterizerOpenGL::Clear() { return; } - const auto [clear_depth, clear_stencil] = ConfigureFramebuffers( - clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); + ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + + SyncViewport(clear_state); if (regs.clear_flags.scissor) { SyncScissorTest(clear_state); } @@ -576,21 +681,18 @@ void RasterizerOpenGL::Clear() { clear_state.EmulateViewportWithScissor(); } - clear_state.ApplyColorMask(); - clear_state.ApplyDepth(); - clear_state.ApplyStencilTest(); - clear_state.ApplyViewport(); - clear_state.ApplyFramebufferState(); + clear_state.AllDirty(); + clear_state.Apply(); if (use_color) { - glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); + glClearBufferfv(GL_COLOR, 0, regs.clear_color); } - if (clear_depth && clear_stencil) { + if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); - } else if (clear_depth) { + } else if (use_depth) { glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth); - } else if (clear_stencil) { + } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil); } } @@ -601,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.ShouldExecute()) { + return; + } + const auto& regs = gpu.regs; SyncColorMask(); @@ -634,26 +741,47 @@ void RasterizerOpenGL::DrawArrays() { Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += - Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + 
device.GetUniformBufferAlignment()); - const bool invalidate = buffer_cache.Map(buffer_size); - if (invalidate) { - // As all cached buffers are invalidated, we need to recheck their state. - gpu.dirty_flags.vertex_array.set(); - } + // Prepare the vertex array. + buffer_cache.Map(buffer_size); + // Prepare vertex array format. const GLuint vao = SetupVertexFormat(); + vertex_array_pushbuffer.Setup(vao); + + // Upload vertex and index data. SetupVertexBuffer(vao); + SetupVertexInstances(vao); + const GLintptr index_buffer_offset = SetupIndexBuffer(); + + // Setup draw parameters. It will automatically choose what glDraw* method to use. + const DrawParameters params = SetupDraw(index_buffer_offset); + + // Prepare packed bindings. + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); - DrawParameters params = SetupDraw(); + // Setup shaders and their used resources. texture_cache.GuardSamplers(true); SetupShaders(params.primitive_mode); texture_cache.GuardSamplers(false); ConfigureFramebuffers(state); - buffer_cache.Unmap(); + // Signal the buffer cache that we are not going to upload more things. + const bool invalidate = buffer_cache.Unmap(); + + // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. + vertex_array_pushbuffer.Bind(); + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + if (invalidate) { + // As all cached buffers are invalidated, we need to recheck their state. 
+ gpu.dirty.ResetVertexArrays(); + } shader_program_manager->ApplyTo(state); state.Apply(); @@ -665,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() { params.DispatchDraw(); accelerate_draw = AccelDraw::Disabled; + gpu.dirty.memory_general = false; +} + +void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { + if (!GLAD_GL_ARB_compute_variable_group_size) { + LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " + "lack of GL_ARB_compute_variable_group_size"); + return; + } + + auto kernel = shader_cache.GetComputeKernel(code_addr); + const auto [program, next_bindings] = kernel->GetProgramHandle({}); + state.draw.shader_program = program; + state.draw.program_pipeline = 0; + + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + SetupComputeConstBuffers(kernel); + SetupComputeGlobalMemory(kernel); + + // TODO(Rodrigo): Bind images and samplers + + buffer_cache.Unmap(); + + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + state.ApplyShaderProgram(); + state.ApplyProgramPipeline(); + + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, + launch_desc.grid_dim_z, launch_desc.block_dim_x, + launch_desc.block_dim_y, launch_desc.block_dim_z); } void RasterizerOpenGL::FlushAll() {} @@ -675,7 +843,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { return; } texture_cache.FlushRegion(addr, size); - global_cache.FlushRegion(addr, size); + buffer_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -685,7 +853,6 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { } texture_cache.InvalidateRegion(addr, size); 
shader_cache.InvalidateRegion(addr, size); - global_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); } @@ -696,6 +863,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { InvalidateRegion(addr, size); } +void RasterizerOpenGL::TickFrame() { + buffer_cache.TickFrame(); +} + bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { @@ -737,14 +908,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; - const auto& entries = shader->GetShaderEntries().const_buffers; + const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& shader_stage = stages[static_cast<std::size_t>(stage)]; + for (const auto& entry : shader->GetShaderEntries().const_buffers) { + const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); + } +} - // Upload only the enabled buffers from the 16 constbuffers of each shader stage - for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry = entries[bindpoint]; - SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); +void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { + MICROPROFILE_SCOPE(OpenGL_UBO); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; + const std::bitset<8> mask = 
launch_desc.memory_config.const_buffer_enable_mask.Value(); + Tegra::Engines::ConstBufferInfo buffer; + buffer.address = config.Address(); + buffer.size = config.size; + buffer.enabled = mask[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); } } @@ -752,49 +934,52 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b const GLShader::ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(0, 0, 0); + bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); return; } - std::size_t size; - if (entry.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - size = buffer.size; - - if (size > MaxConstbufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size, - MaxConstbufferSize); - size = MaxConstbufferSize; - } - } else { - // Buffer is accessed directly, upload just what we use - size = entry.GetSize(); - } - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 // UBO alignment requirements. 
- size = Common::AlignUp(size, sizeof(GLvec4)); - ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big"); + const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const std::size_t alignment = device.GetUniformBufferAlignment(); - const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment); - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size); + const auto alignment = device.GetUniformBufferAlignment(); + const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); + bind_ubo_pushbuffer.Push(cbuf, offset, size); } -void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader) { - const auto& entries = shader->GetShaderEntries().global_memory_entries; - for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry{entries[bindpoint]}; - const auto& region{global_cache.GetGlobalRegion(entry, stage)}; - if (entry.IsWritten()) { - region->MarkAsModified(true, global_cache); - } - bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, - static_cast<GLsizeiptr>(region->GetSizeInBytes())); +void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; + for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); + } +} + +void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto 
cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); } } +void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, + GPUVAddr gpu_addr, std::size_t size) { + const auto alignment{device.GetShaderStorageBufferAlignment()}; + const auto [ssbo, buffer_offset] = + buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); + bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +} + TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader, BaseBindings base_bindings) { MICROPROFILE_SCOPE(OpenGL_Texture); @@ -883,10 +1068,11 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); - state.cull.enabled = regs.cull.enabled != 0; + const auto& regs = maxwell3d.regs; + state.cull.enabled = regs.cull.enabled != 0; if (state.cull.enabled) { state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); @@ -919,16 +1105,21 @@ void RasterizerOpenGL::SyncDepthTestState() { state.depth.test_enabled = regs.depth_test_enable != 0; state.depth.write_mask = regs.depth_write_enabled ? 
GL_TRUE : GL_FALSE; - if (!state.depth.test_enabled) + if (!state.depth.test_enabled) { return; + } state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); } void RasterizerOpenGL::SyncStencilTestState() { - const auto& regs = system.GPU().Maxwell3D().regs; - state.stencil.test_enabled = regs.stencil_enable != 0; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.stencil_test) { + return; + } + const auto& regs = maxwell3d.regs; + state.stencil.test_enabled = regs.stencil_enable != 0; if (!regs.stencil_enable) { return; } @@ -957,10 +1148,17 @@ void RasterizerOpenGL::SyncStencilTestState() { state.stencil.back.action_depth_fail = GL_KEEP; state.stencil.back.action_depth_pass = GL_KEEP; } + state.MarkDirtyStencilState(); + maxwell3d.dirty.stencil_test = false; } void RasterizerOpenGL::SyncColorMask() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.color_mask) { + return; + } + const auto& regs = maxwell3d.regs; + const std::size_t count = regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; for (std::size_t i = 0; i < count; i++) { @@ -971,6 +1169,9 @@ void RasterizerOpenGL::SyncColorMask() { dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; dest.alpha_enabled = (source.A == 0) ? 
GL_FALSE : GL_TRUE; } + + state.MarkDirtyColorMask(); + maxwell3d.dirty.color_mask = false; } void RasterizerOpenGL::SyncMultiSampleState() { @@ -985,7 +1186,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() { } void RasterizerOpenGL::SyncBlendState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.blend_state) { + return; + } + const auto& regs = maxwell3d.regs; state.blend_color.red = regs.blend_color.r; state.blend_color.green = regs.blend_color.g; @@ -1008,6 +1213,8 @@ void RasterizerOpenGL::SyncBlendState() { for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { state.blend[i].enabled = false; } + maxwell3d.dirty.blend_state = false; + state.MarkDirtyBlendState(); return; } @@ -1024,6 +1231,9 @@ void RasterizerOpenGL::SyncBlendState() { blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); } + + state.MarkDirtyBlendState(); + maxwell3d.dirty.blend_state = false; } void RasterizerOpenGL::SyncLogicOpState() { @@ -1075,13 +1285,21 @@ void RasterizerOpenGL::SyncPointState() { } void RasterizerOpenGL::SyncPolygonOffset() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.polygon_offset) { + return; + } + const auto& regs = maxwell3d.regs; + state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; state.polygon_offset.units = regs.polygon_offset_units; state.polygon_offset.factor = regs.polygon_offset_factor; state.polygon_offset.clamp = regs.polygon_offset_clamp; + + state.MarkDirtyPolygonOffset(); + maxwell3d.dirty.polygon_offset = false; } void RasterizerOpenGL::SyncAlphaTest() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h 
b/src/video_core/renderer_opengl/gl_rasterizer.h index bf67e3a70..8b123c48d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,7 +24,6 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" -#include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -59,10 +58,12 @@ public: void DrawArrays() override; void Clear() override; + void DispatchCompute(GPUVAddr code_addr) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; @@ -73,11 +74,6 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - /// Maximum supported size that a constbuffer can have in bytes. - static constexpr std::size_t MaxConstbufferSize = 0x10000; - static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, - "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); - private: struct FramebufferConfigState { bool using_color_fb{}; @@ -98,30 +94,45 @@ private: /** * Configures the color and depth framebuffer states. - * @param must_reconfigure If true, tells the framebuffer to skip the cache and reconfigure - * again. Used by the texture cache to solve texception conflicts - * @param use_color_fb If true, configure color framebuffers. 
- * @param using_depth_fb If true, configure the depth/stencil framebuffer. - * @param preserve_contents If true, tries to preserve data from a previously used framebuffer. + * + * @param current_state The current OpenGL state. + * @param using_color_fb If true, configure color framebuffers. + * @param using_depth_fb If true, configure the depth/stencil framebuffer. + * @param preserve_contents If true, tries to preserve data from a previously used + * framebuffer. * @param single_color_target Specifies if a single color buffer target should be used. + * * @returns If depth (first) or stencil (second) are being stored in the bound zeta texture - * (requires using_depth_fb to be true) + * (requires using_depth_fb to be true) */ std::pair<bool, bool> ConfigureFramebuffers( - OpenGLState& current_state, bool use_color_fb = true, bool using_depth_fb = true, + OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true, bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); + void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb); + /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader); + /// Configures the current constbuffers to use for the kernel invocation. + void SetupComputeConstBuffers(const Shader& kernel); + /// Configures a constant buffer. void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, const GLShader::ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. - void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader); + void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader); + + /// Configures the current global memory entries to use for the kernel invocation. 
+ void SetupComputeGlobalMemory(const Shader& kernel); + + /// Configures a constant buffer. + void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + std::size_t size); /// Configures the current textures to use for the draw command. Returns shaders texture buffer /// usage. @@ -189,7 +200,6 @@ private: TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; - GlobalRegionCacheOpenGL global_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; @@ -208,6 +218,7 @@ private: static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; + VertexArrayPushBuffer vertex_array_pushbuffer; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; @@ -219,14 +230,19 @@ private: GLuint SetupVertexFormat(); void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); - DrawParameters SetupDraw(); + GLintptr SetupIndexBuffer(); + + DrawParameters SetupDraw(GLintptr index_buffer_offset); void SetupShaders(GLenum primitive_mode); enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; + OGLFramebuffer clear_framebuffer; + using CachedPageMap = boost::icl::interval_map<u64, int>; CachedPageMap cached_pages; }; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h index defbc2d81..34ee37f00 100644 --- a/src/video_core/renderer_opengl/gl_sampler_cache.h +++ b/src/video_core/renderer_opengl/gl_sampler_cache.h @@ -17,9 +17,9 @@ public: ~SamplerCacheOpenGL(); protected: - OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const; + OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - GLuint ToSamplerType(const OGLSampler& sampler) const; + GLuint ToSamplerType(const OGLSampler& sampler) const override; }; } // namespace OpenGL diff --git 
a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index f9b2b03a0..1c90facc3 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -23,13 +23,13 @@ namespace OpenGL { using VideoCommon::Shader::ProgramCode; -// One UBO is always reserved for emulation values -constexpr u32 RESERVED_UBOS = 1; +// One UBO is always reserved for emulation values on staged shaders +constexpr u32 STAGE_RESERVED_UBOS = 1; struct UnspecializedShader { std::string code; GLShader::ShaderEntries entries; - Maxwell::ShaderProgram program_type; + ProgramType program_type; }; namespace { @@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g } /// Gets the shader type from a Maxwell program type -constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) { +constexpr GLenum GetShaderType(ProgramType program_type) { switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GL_VERTEX_SHADER; - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GL_GEOMETRY_SHADER; - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GL_FRAGMENT_SHADER; + case ProgramType::Compute: + return GL_COMPUTE_SHADER; default: return GL_NONE; } @@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen } } +ProgramType GetProgramType(Maxwell::ShaderProgram program) { + switch (program) { + case Maxwell::ShaderProgram::VertexA: + return ProgramType::VertexA; + case Maxwell::ShaderProgram::VertexB: + return ProgramType::VertexB; + case Maxwell::ShaderProgram::TesselationControl: + return ProgramType::TessellationControl; + case Maxwell::ShaderProgram::TesselationEval: + return ProgramType::TessellationEval; + case Maxwell::ShaderProgram::Geometry: 
+ return ProgramType::Geometry; + case Maxwell::ShaderProgram::Fragment: + return ProgramType::Fragment; + } + UNREACHABLE(); + return {}; +} + /// Calculates the size of a program stream std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { constexpr std::size_t start_offset = 10; @@ -128,11 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { } /// Hashes one (or two) program streams -u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code, - const ProgramCode& code_b) { - u64 unique_identifier = - Common::CityHash64(reinterpret_cast<const char*>(code.data()), CalculateProgramSize(code)); - if (program_type != Maxwell::ShaderProgram::VertexA) { +u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code, + const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) { + if (size_a == 0) { + size_a = CalculateProgramSize(code); + } + u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a); + if (program_type != ProgramType::VertexA) { return unique_identifier; } // VertexA programs include two programs @@ -140,50 +163,67 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& std::size_t seed = 0; boost::hash_combine(seed, unique_identifier); - const u64 identifier_b = Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), - CalculateProgramSize(code_b)); + if (size_b == 0) { + size_b = CalculateProgramSize(code_b); + } + const u64 identifier_b = + Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b); boost::hash_combine(seed, identifier_b); return static_cast<u64>(seed); } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, +GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type, ProgramCode program_code, ProgramCode program_code_b) { 
GLShader::ShaderSetup setup(program_code); - if (program_type == Maxwell::ShaderProgram::VertexA) { + setup.program.size_a = CalculateProgramSize(program_code); + setup.program.size_b = 0; + if (program_type == ProgramType::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. // Conventional HW does not support this, so we combine VertexA and VertexB into one // stage here. setup.SetProgramB(program_code_b); + setup.program.size_b = CalculateProgramSize(program_code_b); } - setup.program.unique_identifier = - GetUniqueIdentifier(program_type, program_code, program_code_b); + setup.program.unique_identifier = GetUniqueIdentifier( + program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b); switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GLShader::GenerateVertexShader(device, setup); - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GLShader::GenerateGeometryShader(device, setup); - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GLShader::GenerateFragmentShader(device, setup); + case ProgramType::Compute: + return GLShader::GenerateComputeShader(device, setup); default: - LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type)); return {}; } } CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries, - Maxwell::ShaderProgram program_type, const ProgramVariant& variant, + ProgramType program_type, const ProgramVariant& variant, bool hint_retrievable = false) { auto base_bindings{variant.base_bindings}; const auto primitive_mode{variant.primitive_mode}; const auto texture_buffer_usage{variant.texture_buffer_usage}; std::string source = "#version 430 
core\n" - "#extension GL_ARB_separate_shader_objects : enable\n\n"; - source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + "#extension GL_ARB_separate_shader_objects : enable\n"; + if (entries.shader_viewport_layer_array) { + source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; + } + if (program_type == ProgramType::Compute) { + source += "#extension GL_ARB_compute_variable_group_size : require\n"; + } + source += '\n'; + + if (program_type != ProgramType::Compute) { + source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + } for (const auto& cbuf : entries.const_buffers) { source += @@ -210,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); } - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { const auto [glsl_topology, debug_name, max_vertices] = GetPrimitiveDescription(primitive_mode); source += "layout (" + std::string(glsl_topology) + ") in;\n"; source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; } + if (program_type == ProgramType::Compute) { + source += "layout (local_size_variable) in;\n"; + } source += code; @@ -244,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, +CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result) : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, program_type{program_type}, @@ -257,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, ProgramCode&& program_code_b) { const auto code_size{CalculateProgramSize(program_code)}; const auto 
code_size_b{CalculateProgramSize(program_code_b)}; - auto result{CreateProgram(params.device, program_type, program_code, program_code_b)}; + auto result{ + CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)}; if (result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now return {}; } params.disk_cache.SaveRaw(ShaderDiskCacheRaw( - params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)), - static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code), - std::move(program_code_b))); + params.unique_identifier, GetProgramType(program_type), + static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)), + std::move(program_code), std::move(program_code_b))); - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); } Shader CachedShader::CreateStageFromCache(const ShaderParameters& params, Maxwell::ShaderProgram program_type, GLShader::ProgramResult result) { - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); +} + +Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) { + auto result{CreateProgram(params.device, ProgramType::Compute, code, {})}; + + const auto code_size{CalculateProgramSize(code)}; + params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute, + static_cast<u32>(code_size / sizeof(u64)), 0, + std::move(code), {})); + + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); +} + +Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params, + 
GLShader::ProgramResult result) { + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); } std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) { GLuint handle{}; - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { handle = GetGeometryShader(variant); } else { const auto [entry, is_cache_miss] = programs.try_emplace(variant); @@ -297,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar handle = program->handle; } - auto base_bindings{variant.base_bindings}; - base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS; + auto base_bindings = variant.base_bindings; + base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); + if (program_type != ProgramType::Compute) { + base_bindings.cbuf += STAGE_RESERVED_UBOS; + } base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); base_bindings.sampler += static_cast<u32>(entries.samplers.size()); @@ -561,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty_flags.shaders) { + if (!system.GPU().Maxwell3D().dirty.shaders) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -578,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // No shader found - create a new one ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; - if (program == Maxwell::ShaderProgram::VertexA) { + const bool is_program_a{program == Maxwell::ShaderProgram::VertexA}; + if (is_program_a) { const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; program_code_b = GetShaderCode(memory_manager, program_addr_b, 
memory_manager.GetPointer(program_addr_b)); } - const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); + const auto unique_identifier = + GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b); const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, host_ptr, unique_identifier}; @@ -601,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { return last_shaders[static_cast<std::size_t>(program)] = shader; } +Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + auto& memory_manager{system.GPU().MemoryManager()}; + const auto host_ptr{memory_manager.GetPointer(code_addr)}; + auto kernel = TryGet(host_ptr); + if (kernel) { + return kernel; + } + + // No kernel found - create a new one + auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; + const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})}; + const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; + const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, + host_ptr, unique_identifier}; + + const auto found = precompiled_shaders.find(unique_identifier); + if (found == precompiled_shaders.end()) { + kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + } else { + kernel = CachedShader::CreateKernelFromCache(params, found->second); + } + + Register(kernel); + return kernel; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index bbb53cdf4..a3106a0ff 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -61,6 +61,11 @@ public: Maxwell::ShaderProgram program_type, GLShader::ProgramResult result); + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& 
code); + + static Shader CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result); + VAddr GetCpuAddr() const override { return cpu_addr; } @@ -78,7 +83,7 @@ public: std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant); private: - explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, + explicit CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result); // Geometry programs. These are needed because GLSL needs an input topology but it's not @@ -104,7 +109,7 @@ private: u8* host_ptr{}; VAddr cpu_addr{}; u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; ShaderDiskCacheOpenGL& disk_cache; const PrecompiledPrograms& precompiled_programs; @@ -132,6 +137,9 @@ public: /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); + /// Gets a compute kernel in the passed address + Shader GetComputeKernel(GPUVAddr code_addr); + protected: // We do not have to flush this cache as things in it are never modified by us. 
void FlushObjectInner(const Shader& object) override {} diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 5f2f1510c..ffe26b241 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -14,6 +14,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -36,7 +37,6 @@ using namespace std::string_literals; using namespace VideoCommon::Shader; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; @@ -46,7 +46,7 @@ using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureAoffi, TextureArgument>; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); + static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); class ShaderWriter { public: @@ -161,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +constexpr bool IsVertexShader(ProgramType stage) { + return stage == ProgramType::VertexA || stage == ProgramType::VertexB; +} + class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage, std::string suffix) : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} @@ -191,10 +195,12 @@ public: // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems // unlikely 
that shaders will use 20 nested SSYs and PBKs. - constexpr u32 FLOW_STACK_SIZE = 20; - for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { - code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); - code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + if (!ir.IsFlowStackDisabled()) { + constexpr u32 FLOW_STACK_SIZE = 20; + for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { + code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); + code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + } } code.AddLine("while (true) {{"); @@ -244,24 +250,22 @@ public: usage.is_read, usage.is_written); } entries.clip_distances = ir.GetClipDistances(); + entries.shader_viewport_layer_array = + IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex()); entries.shader_length = ir.GetLength(); return entries; } private: - using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - void DeclareVertex() { - if (stage != ShaderStage::Vertex) + if (!IsVertexShader(stage)) return; DeclareVertexRedeclarations(); } void DeclareGeometry() { - if (stage != ShaderStage::Geometry) { + if (stage != ProgramType::Geometry) { return; } @@ -280,22 +284,35 @@ private: } void DeclareVertexRedeclarations() { - bool clip_distances_declared = false; - code.AddLine("out gl_PerVertex {{"); ++code.scope; code.AddLine("vec4 gl_Position;"); - for (const auto o : ir.GetOutputAttributes()) { - if (o == Attribute::Index::PointSize) - code.AddLine("float gl_PointSize;"); - if (!clip_distances_declared && (o == Attribute::Index::ClipDistances0123 || - o == Attribute::Index::ClipDistances4567)) { + for (const auto attribute : ir.GetOutputAttributes()) { + if (attribute == Attribute::Index::ClipDistances0123 || + attribute == Attribute::Index::ClipDistances4567) { code.AddLine("float 
gl_ClipDistance[];"); - clip_distances_declared = true; + break; } } + if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { + if (ir.UsesLayer()) { + code.AddLine("int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("int gl_ViewportIndex;"); + } + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && + !device.HasVertexViewportLayer()) { + LOG_ERROR( + Render_OpenGL, + "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); + } + + if (ir.UsesPointSize()) { + code.AddLine("float gl_PointSize;"); + } --code.scope; code.AddLine("}};"); @@ -323,11 +340,16 @@ private: } void DeclareLocalMemory() { - if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) { - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; - code.AddLine("float {}[{}];", GetLocalMemory(), element_count); - code.AddNewLine(); + // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at + // specialization time. + const u64 local_memory_size = + stage == ProgramType::Compute ? 
0x400 : header.GetLocalMemorySize(); + if (local_memory_size == 0) { + return; } + const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + code.AddLine("float {}[{}];", GetLocalMemory(), element_count); + code.AddNewLine(); } void DeclareInternalFlags() { @@ -381,12 +403,12 @@ private: const u32 location{GetGenericAttributeIndex(index)}; std::string name{GetInputAttribute(index)}; - if (stage == ShaderStage::Geometry) { + if (stage == ProgramType::Geometry) { name = "gs_" + name + "[]"; } std::string suffix; - if (stage == ShaderStage::Fragment) { + if (stage == ProgramType::Fragment) { const auto input_mode{header.ps.GetAttributeUse(location)}; if (skip_unused && input_mode == AttributeUse::Unused) { return; @@ -398,7 +420,7 @@ private: } void DeclareOutputAttributes() { - if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) { + if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) { for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) { DeclareOutputAttribute(ToGenericAttribute(i)); } @@ -520,7 +542,7 @@ private: constexpr u32 element_stride{4}; const u32 address{generic_base + index * generic_stride + element * element_stride}; - const bool declared{stage != ShaderStage::Fragment || + const bool declared{stage != ProgramType::Fragment || header.ps.GetAttributeUse(index) != AttributeUse::Unused}; const std::string value{declared ? 
ReadAttribute(attribute, element) : "0"}; code.AddLine("case 0x{:x}: return {};", address, value); @@ -624,7 +646,7 @@ private: } if (const auto abuf = std::get_if<AbufNode>(&*node)) { - UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry, + UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry, "Physical attributes in geometry shaders are not implemented"); if (abuf->IsPhysicalBuffer()) { return fmt::format("readPhysicalAttribute(ftou({}))", @@ -679,6 +701,9 @@ private: } if (const auto lmem = std::get_if<LmemNode>(&*node)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } @@ -708,7 +733,7 @@ private: std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) { const auto GeometryPass = [&](std::string_view name) { - if (stage == ShaderStage::Geometry && buffer) { + if (stage == ProgramType::Geometry && buffer) { // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. @@ -720,10 +745,10 @@ private: switch (attribute) { case Attribute::Index::Position: switch (stage) { - case ShaderStage::Geometry: + case ProgramType::Geometry: return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer), GetSwizzle(element)); - case ShaderStage::Fragment: + case ProgramType::Fragment: return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)); default: UNREACHABLE(); @@ -744,7 +769,7 @@ private: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. 
- ASSERT(stage == ShaderStage::Vertex); + ASSERT(IsVertexShader(stage)); switch (element) { case 2: // Config pack's first value is instance_id. @@ -756,7 +781,7 @@ private: return "0"; case Attribute::Index::FrontFacing: // TODO(Subv): Find out what the values are for the other elements. - ASSERT(stage == ShaderStage::Fragment); + ASSERT(stage == ProgramType::Fragment); switch (element) { case 3: return "itof(gl_FrontFacing ? -1 : 0)"; @@ -778,7 +803,7 @@ private: return value; } // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders - const std::string precise = stage != ShaderStage::Fragment ? "precise " : ""; + const std::string precise = stage != ProgramType::Fragment ? "precise " : ""; const std::string temporary = code.GenerateTemporary(); code.AddLine("{}float {} = {};", precise, temporary, value); @@ -803,6 +828,45 @@ private: return CastOperand(VisitOperand(operation, operand_index), type); } + std::optional<std::pair<std::string, bool>> GetOutputAttribute(const AbufNode* abuf) { + switch (const auto attribute = abuf->GetIndex()) { + case Attribute::Index::Position: + return std::make_pair("gl_Position"s + GetSwizzle(abuf->GetElement()), false); + case Attribute::Index::LayerViewportPointSize: + switch (abuf->GetElement()) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + return {}; + } + return std::make_pair("gl_Layer", true); + case 2: + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + return {}; + } + return std::make_pair("gl_ViewportIndex", true); + case 3: + UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); + return std::make_pair("gl_PointSize", false); + } + return {}; + case Attribute::Index::ClipDistances0123: + return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), false); + case Attribute::Index::ClipDistances4567: + return 
std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), + false); + default: + if (IsGenericAttribute(attribute)) { + return std::make_pair( + GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), false); + } + UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); + return {}; + } + } + std::string CastOperand(const std::string& value, Type type) const { switch (type) { case Type::Bool: @@ -999,6 +1063,8 @@ private: const Node& src = operation[1]; std::string target; + bool is_integer = false; + if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { // Writing to Register::ZeroIndex is a no op @@ -1007,27 +1073,16 @@ private: target = GetRegister(gpr->GetIndex()); } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer()); - - target = [&]() -> std::string { - switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) { - case Attribute::Index::Position: - return "gl_Position"s + GetSwizzle(abuf->GetElement()); - case Attribute::Index::PointSize: - return "gl_PointSize"; - case Attribute::Index::ClipDistances0123: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement()); - case Attribute::Index::ClipDistances4567: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4); - default: - if (IsGenericAttribute(attribute)) { - return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()); - } - UNIMPLEMENTED_MSG("Unhandled output attribute: {}", - static_cast<u32>(attribute)); - return "0"; - } - }(); + const auto result = GetOutputAttribute(abuf); + if (!result) { + return {}; + } + target = result->first; + is_integer = result->second; } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), 
Visit(lmem->GetAddress())); } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { const std::string real = Visit(gmem->GetRealAddress()); @@ -1038,7 +1093,11 @@ private: UNREACHABLE_MSG("Assign called without a proper target"); } - code.AddLine("{} = {};", target, Visit(src)); + if (is_integer) { + code.AddLine("{} = ftoi({});", target, Visit(src)); + } else { + code.AddLine("{} = {};", target, Visit(src)); + } return {}; } @@ -1351,14 +1410,10 @@ private: return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint)); } - std::string LogicalAll2(Operation operation) { + std::string LogicalAnd2(Operation operation) { return GenerateUnary(operation, "all", Type::Bool, Type::Bool2); } - std::string LogicalAny2(Operation operation) { - return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); - } - template <bool with_nan> std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) { const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, @@ -1555,6 +1610,14 @@ private: return {}; } + std::string BranchIndirect(Operation operation) { + const std::string op_a = VisitOperand(operation, 0, Type::Uint); + + code.AddLine("jmp_to = {};", op_a); + code.AddLine("break;"); + return {}; + } + std::string PushFlowStack(Operation operation) { const auto stack = std::get<MetaStackClass>(operation.GetMeta()); const auto target = std::get_if<ImmediateNode>(&*operation[0]); @@ -1573,7 +1636,7 @@ private: } std::string Exit(Operation operation) { - if (stage != ShaderStage::Fragment) { + if (stage != ProgramType::Fragment) { code.AddLine("return;"); return {}; } @@ -1624,7 +1687,7 @@ private: } std::string EmitVertex(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EmitVertex is expected to be used in a geometry shader."); // If a geometry shader is attached, it will always flip (it's the last stage before @@ -1635,7 +1698,7 @@ private: } 
std::string EndPrimitive(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EndPrimitive is expected to be used in a geometry shader."); code.AddLine("EndPrimitive();"); @@ -1657,7 +1720,7 @@ private: return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; } - static constexpr OperationDecompilersArray operation_decompilers = { + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, &GLSLDecompiler::Select, @@ -1741,8 +1804,7 @@ private: &GLSLDecompiler::LogicalXor, &GLSLDecompiler::LogicalNegate, &GLSLDecompiler::LogicalPick2, - &GLSLDecompiler::LogicalAll2, - &GLSLDecompiler::LogicalAny2, + &GLSLDecompiler::LogicalAnd2, &GLSLDecompiler::LogicalLessThan<Type::Float>, &GLSLDecompiler::LogicalEqual<Type::Float>, @@ -1789,6 +1851,7 @@ private: &GLSLDecompiler::ImageStore, &GLSLDecompiler::Branch, + &GLSLDecompiler::BranchIndirect, &GLSLDecompiler::PushFlowStack, &GLSLDecompiler::PopFlowStack, &GLSLDecompiler::Exit, @@ -1805,6 +1868,7 @@ private: &GLSLDecompiler::WorkGroupId<1>, &GLSLDecompiler::WorkGroupId<2>, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { return GetDeclarationWithSuffix(index, "gpr"); @@ -1869,7 +1933,7 @@ private: } u32 GetNumPhysicalInputAttributes() const { - return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return IsVertexShader(stage) ? 
GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -1882,7 +1946,7 @@ private: const Device& device; const ShaderIR& ir; - const ShaderStage stage; + const ProgramType stage; const std::string suffix; const Header header; @@ -1913,7 +1977,7 @@ std::string GetCommonDeclarations() { MAX_CONSTBUFFER_ELEMENTS); } -ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, +ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage, const std::string& suffix) { GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 14d11c7fc..2ea02f5bf 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,14 +12,26 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" -namespace OpenGL { -class Device; -} - namespace VideoCommon::Shader { class ShaderIR; } +namespace OpenGL { + +class Device; + +enum class ProgramType : u32 { + VertexA = 0, + VertexB = 1, + TessellationControl = 2, + TessellationEval = 3, + Geometry = 4, + Fragment = 5, + Compute = 6 +}; + +} // namespace OpenGL + namespace OpenGL::GLShader { struct ShaderEntries; @@ -78,12 +90,13 @@ struct ShaderEntries { std::vector<ImageEntry> images; std::vector<GlobalMemoryEntry> global_memory_entries; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + bool shader_viewport_layer_array{}; std::size_t shader_length{}; }; std::string GetCommonDeclarations(); ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Maxwell::ShaderStage stage, const std::string& suffix); + ProgramType stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp 
b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 10688397b..969fe9ced 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, +ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b) : unique_identifier{unique_identifier}, program_type{program_type}, @@ -373,6 +373,12 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn } } + bool shader_viewport_layer_array{}; + if (!LoadObjectFromPrecompiled(shader_viewport_layer_array)) { + return {}; + } + entry.entries.shader_viewport_layer_array = shader_viewport_layer_array; + u64 shader_length{}; if (!LoadObjectFromPrecompiled(shader_length)) { return {}; @@ -445,6 +451,10 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std: } } + if (!SaveObjectToPrecompiled(entries.shader_viewport_layer_array)) { + return false; + } + if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) { return false; } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 4f296dda6..cc8bbd61e 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -18,7 +18,6 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" -#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" namespace Core { @@ -34,14 +33,11 @@ namespace OpenGL { struct ShaderDiskCacheUsage; struct ShaderDiskCacheDump; -using ShaderDumpsMap = 
std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; - using ProgramCode = std::vector<u64>; -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - +using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; using TextureBufferUsage = std::bitset<64>; -/// Allocated bindings used by an OpenGL shader program. +/// Allocated bindings used by an OpenGL shader program struct BaseBindings { u32 cbuf{}; u32 gmem{}; @@ -126,7 +122,7 @@ namespace OpenGL { /// Describes a shader how it's used by the guest GPU class ShaderDiskCacheRaw { public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, + explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b); ShaderDiskCacheRaw(); @@ -141,30 +137,13 @@ public: } bool HasProgramA() const { - return program_type == Maxwell::ShaderProgram::VertexA; + return program_type == ProgramType::VertexA; } - Maxwell::ShaderProgram GetProgramType() const { + ProgramType GetProgramType() const { return program_type; } - Maxwell::ShaderStage GetProgramStage() const { - switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: - return Maxwell::ShaderStage::Vertex; - case Maxwell::ShaderProgram::TesselationControl: - return Maxwell::ShaderStage::TesselationControl; - case Maxwell::ShaderProgram::TesselationEval: - return Maxwell::ShaderStage::TesselationEval; - case Maxwell::ShaderProgram::Geometry: - return Maxwell::ShaderStage::Geometry; - case Maxwell::ShaderProgram::Fragment: - return Maxwell::ShaderStage::Fragment; - } - UNREACHABLE(); - } - const ProgramCode& GetProgramCode() const { return program_code; } @@ -175,7 +154,7 @@ public: private: u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; u32 program_code_size{}; u32 program_code_size_b{}; diff --git 
a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 9148629ec..3a8d9e1da 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D; using VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::ShaderIR; -static constexpr u32 PROGRAM_OFFSET{10}; +static constexpr u32 PROGRAM_OFFSET = 10; +static constexpr u32 COMPUTE_OFFSET = 0; ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); @@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + const auto stage = setup.IsDualProgram() ? 
ProgramType::VertexA : ProgramType::VertexB; + ProgramResult program = Decompile(device, program_ir, stage, "vertex"); out += program.first; if (setup.IsDualProgram()) { - const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); - ProgramResult program_b = - Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); - + const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b); + ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b"); out += program_b.first; } @@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry"); out += program.first; out += R"( @@ -115,10 +114,8 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); - + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment"); out += program.first; out += R"( @@ -130,4 +127,22 @@ void main() { return {std::move(out), std::move(program.second)}; } +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) { + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + + std::string out = "// Shader Unique Id: CS" + id + "\n\n"; + out += GetCommonDeclarations(); + + const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a); + ProgramResult 
program = Decompile(device, program_ir, ProgramType::Compute, "compute"); + out += program.first; + + out += R"( +void main() { + execute_compute(); +} +)"; + return {std::move(out), std::move(program.second)}; +} + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index 0536c8a03..3833e88ab 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -27,6 +27,8 @@ struct ShaderSetup { ProgramCode code; ProgramCode code_b; // Used for dual vertex shaders u64 unique_identifier; + std::size_t size_a; + std::size_t size_b; } program; /// Used in scenarios where we have a dual vertex shaders @@ -52,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se /// Generates the GLSL fragment shader program source code for the given FS program ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); +/// Generates the GLSL compute shader program source code for the given CS program +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup); + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 5f3fe067e..9e74eda0d 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -10,21 +10,25 @@ namespace OpenGL::GLShader { -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type; +namespace { +const char* GetStageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: - debug_type = "vertex"; - break; + return "vertex"; case GL_GEOMETRY_SHADER: - debug_type = "geometry"; - break; + return "geometry"; case GL_FRAGMENT_SHADER: - debug_type = "fragment"; - break; - default: - UNREACHABLE(); + return "fragment"; + case GL_COMPUTE_SHADER: + return "compute"; } + 
UNIMPLEMENTED(); + return "unknown"; +} +} // Anonymous namespace + +GLuint LoadShader(const char* source, GLenum type) { + const char* debug_type = GetStageDebugName(type); const GLuint shader_id = glCreateShader(type); glShaderSource(shader_id, 1, &source, nullptr); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index d86e137ac..f4777d0b0 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -6,8 +6,11 @@ #include <glad/glad.h> #include "common/assert.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "video_core/renderer_opengl/gl_state.h" +MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128)); + namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; @@ -162,6 +165,25 @@ OpenGLState::OpenGLState() { alpha_test.ref = 0.0f; } +void OpenGLState::SetDefaultViewports() { + for (auto& item : viewports) { + item.x = 0; + item.y = 0; + item.width = 0; + item.height = 0; + item.depth_range_near = 0.0f; + item.depth_range_far = 1.0f; + item.scissor.enabled = false; + item.scissor.x = 0; + item.scissor.y = 0; + item.scissor.width = 0; + item.scissor.height = 0; + } + + depth_clamp.far_plane = false; + depth_clamp.near_plane = false; +} + void OpenGLState::ApplyDefaultState() { glEnable(GL_BLEND); glDisable(GL_FRAMEBUFFER_SRGB); @@ -523,7 +545,8 @@ void OpenGLState::ApplySamplers() const { } } -void OpenGLState::Apply() const { +void OpenGLState::Apply() { + MICROPROFILE_SCOPE(OpenGL_State); ApplyFramebufferState(); ApplyVertexArrayState(); ApplyShaderProgram(); @@ -532,19 +555,31 @@ void OpenGLState::Apply() const { ApplyPointSize(); ApplyFragmentColorClamp(); ApplyMultisample(); + if (dirty.color_mask) { + ApplyColorMask(); + dirty.color_mask = false; + } ApplyDepthClamp(); - ApplyColorMask(); ApplyViewport(); - ApplyStencilTest(); 
+ if (dirty.stencil_state) { + ApplyStencilTest(); + dirty.stencil_state = false; + } ApplySRgb(); ApplyCulling(); ApplyDepth(); ApplyPrimitiveRestart(); - ApplyBlending(); + if (dirty.blend_state) { + ApplyBlending(); + dirty.blend_state = false; + } ApplyLogicOp(); ApplyTextures(); ApplySamplers(); - ApplyPolygonOffset(); + if (dirty.polygon_offset) { + ApplyPolygonOffset(); + dirty.polygon_offset = false; + } ApplyAlphaTest(); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index b0140495d..fdf9a8a12 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -195,8 +195,9 @@ public: s_rgb_used = false; } + void SetDefaultViewports(); /// Apply this state as the current OpenGL state - void Apply() const; + void Apply(); void ApplyFramebufferState() const; void ApplyVertexArrayState() const; @@ -237,11 +238,41 @@ public: /// Viewport does not affects glClearBuffer so emulate viewport using scissor test void EmulateViewportWithScissor(); + void MarkDirtyBlendState() { + dirty.blend_state = true; + } + + void MarkDirtyStencilState() { + dirty.stencil_state = true; + } + + void MarkDirtyPolygonOffset() { + dirty.polygon_offset = true; + } + + void MarkDirtyColorMask() { + dirty.color_mask = true; + } + + void AllDirty() { + dirty.blend_state = true; + dirty.stencil_state = true; + dirty.polygon_offset = true; + dirty.color_mask = true; + } + private: static OpenGLState cur_state; // Workaround for sRGB problems caused by QT not supporting srgb output static bool s_rgb_used; + struct { + bool blend_state; + bool stencil_state; + bool viewport_state; + bool polygon_offset; + bool color_mask; + } dirty{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 08ae1a429..408332f90 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ 
b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -31,6 +31,8 @@ using VideoCore::Surface::SurfaceType; MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128)); MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128)); +MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", + MP_RGB(128, 192, 128)); namespace { @@ -135,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]}; - ASSERT(component_type == format.component_type); return format; } @@ -483,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const auto& dst_params{dst_view->GetSurfaceParams()}; OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ prev_state.Apply(); }); + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); OpenGLState state; state.draw.read_framebuffer = src_framebuffer.handle; state.draw.draw_framebuffer = dst_framebuffer.handle; + state.AllDirty(); state.Apply(); u32 buffers{}; @@ -535,6 +540,7 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, } void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) { + MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy); const auto& src_params = src_surface->GetSurfaceParams(); const auto& dst_params = dst_surface->GetSurfaceParams(); UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1); diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b142521ec..a05cef3b9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ 
-101,7 +101,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst RendererOpenGL::~RendererOpenGL() = default; -/// Swap buffers (render frame) void RendererOpenGL::SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { @@ -109,6 +108,7 @@ void RendererOpenGL::SwapBuffers( // Maintain the rasterizer's state as a priority OpenGLState prev_state = OpenGLState::GetCurState(); + state.AllDirty(); state.Apply(); if (framebuffer) { @@ -130,6 +130,8 @@ void RendererOpenGL::SwapBuffers( DrawScreen(render_window.GetFramebufferLayout()); + rasterizer->TickFrame(); + render_window.SwapBuffers(); } @@ -139,6 +141,7 @@ void RendererOpenGL::SwapBuffers( system.GetPerfStats().BeginSystemFrame(); // Restore the rasterizer state + prev_state.AllDirty(); prev_state.Apply(); } @@ -205,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Link shaders and get variable locations shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); state.draw.shader_program = shader.handle; + state.AllDirty(); state.Apply(); uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix"); uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture"); @@ -262,7 +266,6 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - // Initialize sRGB Usage OpenGLState::ClearsRGBUsed(); rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); } @@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, // Workaround brigthness problems in SMO by enabling sRGB in the final output // if it has been used in the frame. 
Needed because of this bug in QT: QTBUG-50987 state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); + state.AllDirty(); state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data()); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); // Restore default state state.framebuffer_srgb.enabled = false; state.texture_units[0].texture = 0; + state.AllDirty(); state.Apply(); // Clear sRGB state for the next frame OpenGLState::ClearsRGBUsed(); @@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() { GLuint old_read_fb = state.draw.read_framebuffer; GLuint old_draw_fb = state.draw.draw_framebuffer; state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; + state.AllDirty(); state.Apply(); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() { screenshot_framebuffer.Release(); state.draw.read_framebuffer = old_read_fb; state.draw.draw_framebuffer = old_draw_fb; + state.AllDirty(); state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index 68c36988d..c504a2c1a 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -13,29 +13,67 @@ namespace OpenGL { +VertexArrayPushBuffer::VertexArrayPushBuffer() = default; + +VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; + +void VertexArrayPushBuffer::Setup(GLuint vao_) { + vao = vao_; + index_buffer = nullptr; + vertex_buffers.clear(); +} + +void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { + index_buffer = buffer; +} + +void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, + GLintptr offset, GLsizei stride) { + vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); +} + +void VertexArrayPushBuffer::Bind() { + if (index_buffer) { + 
glVertexArrayElementBuffer(vao, *index_buffer); + } + + // TODO(Rodrigo): Find a way to ARB_multi_bind this + for (const auto& entry : vertex_buffers) { + glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset, + entry.stride); + } +} + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; void BindBuffersRangePushBuffer::Setup(GLuint first_) { first = first_; - buffers.clear(); + buffer_pointers.clear(); offsets.clear(); sizes.clear(); } -void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) { - buffers.push_back(buffer); +void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) { + buffer_pointers.push_back(buffer); offsets.push_back(offset); sizes.push_back(size); } -void BindBuffersRangePushBuffer::Bind() const { - const std::size_t count{buffers.size()}; +void BindBuffersRangePushBuffer::Bind() { + // Ensure sizes are valid. + const std::size_t count{buffer_pointers.size()}; DEBUG_ASSERT(count == offsets.size() && count == sizes.size()); if (count == 0) { return; } + + // Dereference buffers. 
+ buffers.resize(count); + std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(), + [](const GLuint* pointer) { return *pointer; }); + glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(), sizes.data()); } diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 4a752f3b4..6c2b45546 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,20 +11,49 @@ namespace OpenGL { -class BindBuffersRangePushBuffer { +class VertexArrayPushBuffer final { public: - BindBuffersRangePushBuffer(GLenum target); + explicit VertexArrayPushBuffer(); + ~VertexArrayPushBuffer(); + + void Setup(GLuint vao_); + + void SetIndexBuffer(const GLuint* buffer); + + void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset, + GLsizei stride); + + void Bind(); + +private: + struct Entry { + GLuint binding_index{}; + const GLuint* buffer{}; + GLintptr offset{}; + GLsizei stride{}; + }; + + GLuint vao{}; + const GLuint* index_buffer{}; + std::vector<Entry> vertex_buffers; +}; + +class BindBuffersRangePushBuffer final { +public: + explicit BindBuffersRangePushBuffer(GLenum target); ~BindBuffersRangePushBuffer(); void Setup(GLuint first_); - void Push(GLuint buffer, GLintptr offset, GLsizeiptr size); + void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size); - void Bind() const; + void Bind(); private: - GLenum target; - GLuint first; + GLenum target{}; + GLuint first{}; + std::vector<const GLuint*> buffer_pointers; + std::vector<GLuint> buffers; std::vector<GLintptr> offsets; std::vector<GLsizeiptr> sizes; |
