diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
34 files changed, 3180 insertions, 3338 deletions
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 2b9bd142e..f8a807c84 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,103 +2,71 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cstring> #include <memory> -#include "common/alignment.h" -#include "core/core.h" -#include "video_core/memory_manager.h" +#include <glad/glad.h> + +#include "common/assert.h" +#include "common/microprofile.h" +#include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { -CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset}, - alignment{alignment} {} - -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) - : RasterizerCache{rasterizer}, stream_buffer(size, true) {} - -GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, - bool cache) { - std::lock_guard lock{mutex}; - auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. - cache &= size >= 2048; - - const auto& host_ptr{memory_manager.GetPointer(gpu_addr)}; - if (cache) { - auto entry = TryGet(host_ptr); - if (entry) { - if (entry->GetSize() >= size && entry->GetAlignment() == alignment) { - return entry->GetOffset(); - } - Unregister(entry); - } - } - - AlignBuffer(alignment); - const GLintptr uploaded_offset = buffer_offset; - - if (!host_ptr) { - return uploaded_offset; - } - - std::memcpy(buffer_ptr, host_ptr, size); - buffer_ptr += size; - buffer_offset += size; - - if (cache) { - auto entry = std::make_shared<CachedBufferEntry>( - *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr); - Register(entry); - } - - return uploaded_offset; +MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); + +CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size) + : VideoCommon::BufferBlock{cache_addr, size} { + gl_buffer.Create(); + glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); } -GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment) { - std::lock_guard lock{mutex}; - AlignBuffer(alignment); - std::memcpy(buffer_ptr, raw_pointer, size); - const GLintptr uploaded_offset = buffer_offset; +CachedBufferBlock::~CachedBufferBlock() = default; + +OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size) + : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{ + rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} + +OGLBufferCache::~OGLBufferCache() = default; - buffer_ptr += size; - buffer_offset += size; - return uploaded_offset; +Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) { + return std::make_shared<CachedBufferBlock>(cache_addr, size); } -bool OGLBufferCache::Map(std::size_t max_size) { - bool invalidate; - std::tie(buffer_ptr, buffer_offset_base, invalidate) = - stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4); - buffer_offset = buffer_offset_base; +void OGLBufferCache::WriteBarrier() { + glMemoryBarrier(GL_ALL_BARRIER_BITS); +} + +const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) { + return buffer->GetHandle(); +} - if (invalidate) { - InvalidateAll(); - } - return invalidate; +const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { + static const GLuint null_buffer = 0; + return &null_buffer; } -void OGLBufferCache::Unmap() { - stream_buffer.Unmap(buffer_offset - buffer_offset_base); +void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, + const u8* data) { + glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -GLuint OGLBufferCache::GetHandle() const { - return stream_buffer.GetHandle(); +void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, + u8* data) { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -void OGLBufferCache::AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const GLintptr offset_aligned = - static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment)); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; +void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, + std::size_t dst_offset, std::size_t size) { + glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(), + static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset), + static_cast<GLsizeiptr>(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index f2347581b..022e7bfa9 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -4,80 +4,63 @@ #pragma once -#include <cstddef> #include <memory> -#include <tuple> #include "common/common_types.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" +namespace Core { +class System; +} + namespace OpenGL { +class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferEntry final : public RasterizerCacheObject { -public: - explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } +class CachedBufferBlock; - std::size_t GetSizeInBytes() const override { - return size; - } - - std::size_t GetSize() const { - return size; - } +using Buffer = std::shared_ptr<CachedBufferBlock>; - GLintptr GetOffset() const { - return offset; - } +class CachedBufferBlock : public VideoCommon::BufferBlock { +public: + explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size); + ~CachedBufferBlock(); - std::size_t GetAlignment() const { - return alignment; + const GLuint* GetHandle() const { + return &gl_buffer.handle; } private: - VAddr cpu_addr{}; - std::size_t size{}; - GLintptr offset{}; - std::size_t alignment{}; + OGLBuffer gl_buffer{}; }; -class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { +class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size); - - /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been - /// allocated. - GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool cache = true); + explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size); + ~OGLBufferCache(); - /// Uploads from a host memory. Returns host's buffer offset where it's been allocated. - GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4); + const GLuint* GetEmptyBuffer(std::size_t) override; - bool Map(std::size_t max_size); - void Unmap(); +protected: + Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override; - GLuint GetHandle() const; + void WriteBarrier() override; -protected: - void AlignBuffer(std::size_t alignment); + const GLuint* ToHandle(const Buffer& buffer) override; - // We do not have to flush this cache as things in it are never modified by us. - void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {} + void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, + const u8* data) override; -private: - OGLStreamBuffer stream_buffer; + void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, + u8* data) override; - u8* buffer_ptr = nullptr; - GLintptr buffer_offset = 0; - GLintptr buffer_offset_base = 0; + void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, + std::size_t dst_offset, std::size_t size) override; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index a48e14d2e..4f59a87b4 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -14,52 +14,64 @@ namespace OpenGL { namespace { + template <typename T> T GetInteger(GLenum pname) { GLint temporary; glGetIntegerv(pname, &temporary); return static_cast<T>(temporary); } + +bool TestProgram(const GLchar* glsl) { + const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &glsl)}; + GLint link_status; + glGetProgramiv(shader, GL_LINK_STATUS, &link_status); + glDeleteProgram(shader); + return link_status == GL_TRUE; +} + } // Anonymous namespace Device::Device() { uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && + GLAD_GL_NV_shader_thread_shuffle; + has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = TestComponentIndexingBug(); + has_precise_bug = TestPreciseBug(); + + LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); + LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); + LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); } Device::Device(std::nullptr_t) { uniform_buffer_alignment = 0; max_vertex_attributes = 16; max_varyings = 15; + has_warp_intrinsics = true; + has_vertex_viewport_layer = true; has_variable_aoffi = true; has_component_indexing_bug = false; + has_precise_bug = false; } bool Device::TestVariableAoffi() { - const GLchar* AOFFI_TEST = R"(#version 430 core + return TestProgram(R"(#version 430 core // This is a unit test, please ignore me on apitrace bug reports. uniform sampler2D tex; uniform ivec2 variable_offset; out vec4 output_attribute; void main() { output_attribute = textureOffset(tex, vec2(0), variable_offset); -} -)"; - const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)}; - GLint link_status{}; - glGetProgramiv(shader, GL_LINK_STATUS, &link_status); - glDeleteProgram(shader); - - const bool supported{link_status == GL_TRUE}; - LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported); - return supported; +})"); } bool Device::TestComponentIndexingBug() { - constexpr char log_message[] = "Renderer_ComponentIndexingBug: {}"; const GLchar* COMPONENT_TEST = R"(#version 430 core layout (std430, binding = 0) buffer OutputBuffer { uint output_value; @@ -99,12 +111,21 @@ void main() { GLuint result; glGetNamedBufferSubData(ssbo.handle, 0, sizeof(result), &result); if (result != values.at(index)) { - LOG_INFO(Render_OpenGL, log_message, true); return true; } } - LOG_INFO(Render_OpenGL, log_message, false); return false; } +bool Device::TestPreciseBug() { + return !TestProgram(R"(#version 430 core +in vec3 coords; +out float out_value; +uniform sampler2DShadow tex; +void main() { + precise float tmp_value = vec4(texture(tex, coords)).x; + out_value = tmp_value; +})"); +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 8c8c93760..ba6dcd3be 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -18,6 +18,10 @@ public: return uniform_buffer_alignment; } + std::size_t GetShaderStorageBufferAlignment() const { + return shader_storage_alignment; + } + u32 GetMaxVertexAttributes() const { return max_vertex_attributes; } @@ -26,6 +30,14 @@ public: return max_varyings; } + bool HasWarpIntrinsics() const { + return has_warp_intrinsics; + } + + bool HasVertexViewportLayer() const { + return has_vertex_viewport_layer; + } + bool HasVariableAoffi() const { return has_variable_aoffi; } @@ -34,15 +46,24 @@ public: return has_component_indexing_bug; } + bool HasPreciseBug() const { + return has_precise_bug; + } + private: static bool TestVariableAoffi(); static bool TestComponentIndexingBug(); + static bool TestPreciseBug(); std::size_t uniform_buffer_alignment{}; + std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + bool has_warp_intrinsics{}; + bool has_vertex_viewport_layer{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; + bool has_precise_bug{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp new file mode 100644 index 000000000..7c926bd48 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp @@ -0,0 +1,75 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <tuple> + +#include "common/cityhash.h" +#include "common/scope_exit.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_framebuffer_cache.h" +#include "video_core/renderer_opengl/gl_state.h" + +namespace OpenGL { + +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + +FramebufferCacheOpenGL::FramebufferCacheOpenGL() = default; + +FramebufferCacheOpenGL::~FramebufferCacheOpenGL() = default; + +GLuint FramebufferCacheOpenGL::GetFramebuffer(const FramebufferCacheKey& key) { + const auto [entry, is_cache_miss] = cache.try_emplace(key); + auto& framebuffer{entry->second}; + if (is_cache_miss) { + framebuffer = CreateFramebuffer(key); + } + return framebuffer.handle; +} + +OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheKey& key) { + OGLFramebuffer framebuffer; + framebuffer.Create(); + + // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs. + local_state.draw.draw_framebuffer = framebuffer.handle; + local_state.ApplyFramebufferState(); + + if (key.is_single_buffer) { + if (key.color_attachments[0] != GL_NONE && key.colors[0]) { + key.colors[0]->Attach(key.color_attachments[0], GL_DRAW_FRAMEBUFFER); + glDrawBuffer(key.color_attachments[0]); + } else { + glDrawBuffer(GL_NONE); + } + } else { + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + if (key.colors[index]) { + key.colors[index]->Attach(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), + GL_DRAW_FRAMEBUFFER); + } + } + glDrawBuffers(key.colors_count, key.color_attachments.data()); + } + + if (key.zeta) { + key.zeta->Attach(key.stencil_enable ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT, + GL_DRAW_FRAMEBUFFER); + } + + return framebuffer; +} + +std::size_t FramebufferCacheKey::Hash() const { + static_assert(sizeof(*this) % sizeof(u64) == 0, "Unaligned struct"); + return static_cast<std::size_t>( + Common::CityHash64(reinterpret_cast<const char*>(this), sizeof(*this))); +} + +bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const { + return std::tie(is_single_buffer, stencil_enable, colors_count, color_attachments, colors, + zeta) == std::tie(rhs.is_single_buffer, rhs.stencil_enable, rhs.colors_count, + rhs.color_attachments, rhs.colors, rhs.zeta); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h new file mode 100644 index 000000000..a3a996353 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h @@ -0,0 +1,68 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <cstddef> +#include <unordered_map> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" + +namespace OpenGL { + +struct alignas(sizeof(u64)) FramebufferCacheKey { + bool is_single_buffer = false; + bool stencil_enable = false; + u16 colors_count = 0; + + std::array<GLenum, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> color_attachments{}; + std::array<View, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors; + View zeta; + + std::size_t Hash() const; + + bool operator==(const FramebufferCacheKey& rhs) const; + + bool operator!=(const FramebufferCacheKey& rhs) const { + return !operator==(rhs); + } +}; + +} // namespace OpenGL + +namespace std { + +template <> +struct hash<OpenGL::FramebufferCacheKey> { + std::size_t operator()(const OpenGL::FramebufferCacheKey& k) const noexcept { + return k.Hash(); + } +}; + +} // namespace std + +namespace OpenGL { + +class FramebufferCacheOpenGL { +public: + FramebufferCacheOpenGL(); + ~FramebufferCacheOpenGL(); + + GLuint GetFramebuffer(const FramebufferCacheKey& key); + +private: + OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key); + + OpenGLState local_state; + std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp deleted file mode 100644 index d5e385151..000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <glad/glad.h> - -#include "common/logging/log.h" -#include "core/core.h" -#include "video_core/memory_manager.h" -#include "video_core/renderer_opengl/gl_global_cache.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_opengl/utils.h" - -namespace OpenGL { - -CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, - max_size{max_size} { - buffer.Create(); - LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); -} - -CachedGlobalRegion::~CachedGlobalRegion() = default; - -void CachedGlobalRegion::Reload(u32 size_) { - size = size_; - if (size > max_size) { - size = max_size; - LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, - max_size); - } - glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); -} - -void CachedGlobalRegion::Flush() { - LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr); - glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); -} - -GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { - const auto search{reserve.find(addr)}; - if (search == reserve.end()) { - return {}; - } - return search->second; -} - -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, - u32 size) { - GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; - if (!region) { - // No reserved surface available, create a new one and reserve it - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; - ASSERT(cpu_addr); - - region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); - ReserveGlobalRegion(region); - } - region->Reload(size); - return region; -} - -void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { - reserve.insert_or_assign(region->GetCacheAddr(), std::move(region)); -} - -GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer} { - GLint max_ssbo_size_; - glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); - max_ssbo_size = static_cast<u32>(max_ssbo_size_); -} - -GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( - const GLShader::GlobalMemoryEntry& global_region, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) { - std::lock_guard lock{mutex}; - - auto& gpu{Core::System::GetInstance().GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; - const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + - global_region.GetCbufOffset()}; - const auto actual_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - - // Look up global region in the cache based on address - const auto& host_ptr{memory_manager.GetPointer(actual_addr)}; - GlobalRegion region{TryGet(host_ptr)}; - - if (!region) { - // No global region found - create a new one - region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); - Register(region); - } - - return region; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h deleted file mode 100644 index 2d467a240..000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <unordered_map> - -#include <glad/glad.h> - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" - -namespace OpenGL { - -namespace GLShader { -class GlobalMemoryEntry; -} - -class RasterizerOpenGL; -class CachedGlobalRegion; -using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; - -class CachedGlobalRegion final : public RasterizerCacheObject { -public: - explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); - ~CachedGlobalRegion(); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return size; - } - - /// Gets the GL program handle for the buffer - GLuint GetBufferHandle() const { - return buffer.handle; - } - - /// Reloads the global region from guest memory - void Reload(u32 size_); - - void Flush(); - -private: - VAddr cpu_addr{}; - u8* host_ptr{}; - u32 size{}; - u32 max_size{}; - - OGLBuffer buffer; -}; - -class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> { -public: - explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer); - - /// Gets the current specified shader stage program - GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage); - -protected: - void FlushObjectInner(const GlobalRegion& object) override { - object->Flush(); - } - -private: - GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; - GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); - void ReserveGlobalRegion(GlobalRegion region); - - std::unordered_map<CacheAddr, GlobalRegion> reserve; - u32 max_ssbo_size{}; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d77426067..4e266cdad 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -4,6 +4,7 @@ #include <algorithm> #include <array> +#include <bitset> #include <memory> #include <string> #include <string_view> @@ -19,7 +20,9 @@ #include "core/core.h" #include "core/hle/kernel/process.h" #include "core/settings.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -29,8 +32,10 @@ namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using PixelFormat = VideoCore::Surface::PixelFormat; -using SurfaceType = VideoCore::Surface::SurfaceType; + +using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceTarget; +using VideoCore::Surface::SurfaceType; MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192)); @@ -78,36 +83,31 @@ struct DrawParameters { } }; -struct FramebufferCacheKey { - bool is_single_buffer = false; - bool stencil_enable = false; - - std::array<GLenum, Maxwell::NumRenderTargets> color_attachments{}; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors{}; - u32 colors_count = 0; - - GLuint zeta = 0; - - auto Tie() const { - return std::tie(is_single_buffer, stencil_enable, color_attachments, colors, colors_count, - zeta); +static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, + const GLShader::ConstBufferEntry& entry) { + if (!entry.IsIndirect()) { + return entry.GetSize(); } - bool operator<(const FramebufferCacheKey& rhs) const { - return Tie() < rhs.Tie(); + if (buffer.size > Maxwell::MaxConstBufferSize) { + LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, + Maxwell::MaxConstBufferSize); + return Maxwell::MaxConstBufferSize; } -}; + + return buffer.size; +} RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) - : res_cache{*this}, shader_cache{*this, system, emu_window, device}, - global_cache{*this}, system{system}, screen_info{info}, - buffer_cache(*this, STREAM_BUFFER_SIZE) { + : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, + system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { OpenGLState::ApplyDefaultState(); shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); + clear_framebuffer.Create(); LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); @@ -121,21 +121,16 @@ void RasterizerOpenGL::CheckExtensions() { Render_OpenGL, "Anisotropic filter is not supported! This can cause graphical issues in some games."); } - if (!GLAD_GL_ARB_buffer_storage) { - LOG_WARNING( - Render_OpenGL, - "Buffer storage control is not supported! This can cause performance degradation."); - } } GLuint RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; - if (!gpu.dirty_flags.vertex_attrib_format) { + if (!gpu.dirty.vertex_attrib_format) { return state.draw.vertex_array; } - gpu.dirty_flags.vertex_attrib_format = false; + gpu.dirty.vertex_attrib_format = false; MICROPROFILE_SCOPE(OpenGL_VAO); @@ -152,8 +147,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { state.draw.vertex_array = vao; state.ApplyVertexArrayState(); - glVertexArrayElementBuffer(vao, buffer_cache.GetHandle()); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. // Enables the first 16 vertex attributes always, as we don't know which ones are actually // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 @@ -191,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { } // Rebinding the VAO invalidates the vertex buffer bindings. - gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); state.draw.vertex_array = vao_entry.handle; return vao_entry.handle; @@ -199,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (gpu.dirty_flags.vertex_array.none()) + if (!gpu.dirty.vertex_array_buffers) return; + gpu.dirty.vertex_array_buffers = false; + + const auto& regs = gpu.regs; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty_flags.vertex_array[index]) + if (!gpu.dirty.vertex_array[index]) continue; + gpu.dirty.vertex_array[index] = false; + gpu.dirty.vertex_instance[index] = false; const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) @@ -220,11 +216,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { ASSERT(end > start); const u64 size = end - start + 1; - const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size); + const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the current offset. - glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset, - vertex_array.stride); + vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset, + vertex_array.stride); if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { // Enable vertex buffer instancing with the specified divisor. @@ -234,11 +230,47 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { glVertexArrayBindingDivisor(vao, index, 0); } } +} + +void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { + auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.dirty.vertex_instances) + return; + gpu.dirty.vertex_instances = false; + + const auto& regs = gpu.regs; + // Upload all guest vertex arrays sequentially to our buffer + for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!gpu.dirty.vertex_instance[index]) + continue; - gpu.dirty_flags.vertex_array.reset(); + gpu.dirty.vertex_instance[index] = false; + + if (regs.instanced_arrays.IsInstancingEnabled(index) && + regs.vertex_array[index].divisor != 0) { + // Enable vertex buffer instancing with the specified divisor. + glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); + } else { + // Disable the vertex buffer instancing. + glVertexArrayBindingDivisor(vao, index, 0); + } + } +} + +GLintptr RasterizerOpenGL::SetupIndexBuffer() { + if (accelerate_draw != AccelDraw::Indexed) { + return 0; + } + MICROPROFILE_SCOPE(OpenGL_Index); + const auto& regs = system.GPU().Maxwell3D().regs; + const std::size_t size = CalculateIndexBufferSize(); + const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + vertex_array_pushbuffer.SetIndexBuffer(buffer); + return offset; } -DrawParameters RasterizerOpenGL::SetupDraw() { +DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) { const auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -250,11 +282,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() { params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); if (is_indexed) { - MICROPROFILE_SCOPE(OpenGL_Index); params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); params.count = regs.index_array.count; - params.index_buffer_offset = - buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize()); + params.index_buffer_offset = index_buffer_offset; params.base_vertex = static_cast<GLint>(regs.vb_element_base); } else { params.count = regs.vertex_buffer.count; @@ -270,10 +300,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { BaseBindings base_bindings; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; - // Prepare packed bindings - bind_ubo_pushbuffer.Setup(base_bindings.cbuf); - bind_ssbo_pushbuffer.Setup(base_bindings.gmem); - for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -294,16 +320,21 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = + const auto [buffer, offset] = buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo))); Shader shader{shader_cache.GetStageProgram(program)}; - const auto [program_handle, next_bindings] = - shader->GetProgramHandle(primitive_mode, base_bindings); + + const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); + SetupDrawConstBuffers(stage_enum, shader); + SetupDrawGlobalMemory(stage_enum, shader); + const auto texture_buffer_usage{SetupDrawTextures(stage_enum, shader, base_bindings)}; + + const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage}; + const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant); switch (program) { case Maxwell::ShaderProgram::VertexA: @@ -321,11 +352,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { shader_config.enable.Value(), shader_config.offset); } - const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); - SetupDrawConstBuffers(stage_enum, shader); - SetupGlobalRegions(stage_enum, shader); - SetupTextures(stage_enum, shader, base_bindings); - // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -343,50 +369,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { base_bindings = next_bindings; } - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - SyncClipEnabled(clip_distances); - gpu.dirty_flags.shaders = false; -} - -void RasterizerOpenGL::SetupCachedFramebuffer(const FramebufferCacheKey& fbkey, - OpenGLState& current_state) { - const auto [entry, is_cache_miss] = framebuffer_cache.try_emplace(fbkey); - auto& framebuffer = entry->second; - - if (is_cache_miss) - framebuffer.Create(); - - current_state.draw.draw_framebuffer = framebuffer.handle; - current_state.ApplyFramebufferState(); - - if (!is_cache_miss) - return; - - if (fbkey.is_single_buffer) { - if (fbkey.color_attachments[0] != GL_NONE) { - glFramebufferTexture(GL_DRAW_FRAMEBUFFER, fbkey.color_attachments[0], fbkey.colors[0], - 0); - } - glDrawBuffer(fbkey.color_attachments[0]); - } else { - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - if (fbkey.colors[index]) { - glFramebufferTexture(GL_DRAW_FRAMEBUFFER, - GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), - fbkey.colors[index], 0); - } - } - glDrawBuffers(fbkey.colors_count, fbkey.color_attachments.data()); - } - - if (fbkey.zeta) { - GLenum zeta_attachment = - fbkey.stencil_enable ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT; - glFramebufferTexture(GL_DRAW_FRAMEBUFFER, zeta_attachment, fbkey.zeta, 0); - } + gpu.dirty.shaders = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -469,18 +454,22 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents, single_color_target}; - if (fb_config_state == current_framebuffer_config_state && - gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) { + if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) { // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or // single color targets). This is done because the guest registers may not change but the // host framebuffer may contain different attachments return current_depth_stencil_usage; } + gpu.dirty.render_settings = false; current_framebuffer_config_state = fb_config_state; - Surface depth_surface; + texture_cache.GuardRenderTargets(true); + + View depth_surface{}; if (using_depth_fb) { - depth_surface = res_cache.GetDepthBufferSurface(preserve_contents); + depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); + } else { + texture_cache.SetEmptyDepthBuffer(); } UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); @@ -493,13 +482,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( if (using_color_fb) { if (single_color_target) { // Used when just a single color attachment is enabled, e.g. for clearing a color buffer - Surface color_surface = - res_cache.GetColorBufferSurface(*single_color_target, preserve_contents); + View color_surface{ + texture_cache.GetColorBufferSurface(*single_color_target, preserve_contents)}; if (color_surface) { // Assume that a surface will be written to if it is used as a framebuffer, even if // the shader doesn't actually write to it. - color_surface->MarkAsModified(true, res_cache); + texture_cache.MarkColorBufferInUse(*single_color_target); // Workaround for and issue in nvidia drivers // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/ state.framebuffer_srgb.enabled |= color_surface->GetSurfaceParams().srgb_conversion; @@ -508,16 +497,21 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( fbkey.is_single_buffer = true; fbkey.color_attachments[0] = GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target); - fbkey.colors[0] = color_surface != nullptr ? color_surface->Texture().handle : 0; + fbkey.colors[0] = color_surface; + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + if (index != *single_color_target) { + texture_cache.SetEmptyColorBuffer(index); + } + } } else { // Multiple color attachments are enabled for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents); + View color_surface{texture_cache.GetColorBufferSurface(index, preserve_contents)}; if (color_surface) { // Assume that a surface will be written to if it is used as a framebuffer, even // if the shader doesn't actually write to it. - color_surface->MarkAsModified(true, res_cache); + texture_cache.MarkColorBufferInUse(index); // Enable sRGB only for supported formats // Workaround for and issue in nvidia drivers // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/ @@ -527,8 +521,7 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( fbkey.color_attachments[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); - fbkey.colors[index] = - color_surface != nullptr ? color_surface->Texture().handle : 0; + fbkey.colors[index] = color_surface; } fbkey.is_single_buffer = false; fbkey.colors_count = regs.rt_control.count; @@ -541,26 +534,84 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( if (depth_surface) { // Assume that a surface will be written to if it is used as a framebuffer, even if // the shader doesn't actually write to it. - depth_surface->MarkAsModified(true, res_cache); + texture_cache.MarkDepthBufferInUse(); - fbkey.zeta = depth_surface->Texture().handle; - fbkey.stencil_enable = regs.stencil_enable && - depth_surface->GetSurfaceParams().type == SurfaceType::DepthStencil; + fbkey.zeta = depth_surface; + fbkey.stencil_enable = depth_surface->GetSurfaceParams().type == SurfaceType::DepthStencil; } - SetupCachedFramebuffer(fbkey, current_state); + texture_cache.GuardRenderTargets(false); + + current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(fbkey); SyncViewport(current_state); return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable}; } +void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb) { + auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + texture_cache.GuardRenderTargets(true); + View color_surface{}; + if (using_color_fb) { + color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false); + } + View depth_surface{}; + if (using_depth_fb || using_stencil_fb) { + depth_surface = texture_cache.GetDepthBufferSurface(false); + } + texture_cache.GuardRenderTargets(false); + + current_state.draw.draw_framebuffer = clear_framebuffer.handle; + current_state.ApplyFramebufferState(); + + if (color_surface) { + color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + } + + if (depth_surface) { + const auto& params = depth_surface->GetSurfaceParams(); + switch (params.type) { + case VideoCore::Surface::SurfaceType::Depth: + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + break; + case VideoCore::Surface::SurfaceType::DepthStencil: + depth_surface->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + break; + default: + UNIMPLEMENTED(); + } + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } +} + void RasterizerOpenGL::Clear() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& maxwell3d = system.GPU().Maxwell3D(); + + if (!maxwell3d.ShouldExecute()) { + return; + } + + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState clear_state; + OpenGLState prev_state{OpenGLState::GetCurState()}; + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); + + OpenGLState clear_state{OpenGLState::GetCurState()}; + clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; @@ -580,11 +631,13 @@ void RasterizerOpenGL::Clear() { // true. clear_state.depth.test_enabled = true; clear_state.depth.test_func = GL_ALWAYS; + clear_state.depth.write_mask = GL_TRUE; } if (regs.clear_buffers.S) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); use_stencil = true; clear_state.stencil.test_enabled = true; + if (regs.clear_flags.stencil) { // Stencil affects the clear so fill it with the used masks clear_state.stencil.front.test_func = GL_ALWAYS; @@ -616,8 +669,9 @@ void RasterizerOpenGL::Clear() { return; } - const auto [clear_depth, clear_stencil] = ConfigureFramebuffers( - clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); + ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + + SyncViewport(clear_state); if (regs.clear_flags.scissor) { SyncScissorTest(clear_state); } @@ -626,20 +680,18 @@ void RasterizerOpenGL::Clear() { clear_state.EmulateViewportWithScissor(); } - clear_state.ApplyColorMask(); - clear_state.ApplyDepth(); - clear_state.ApplyStencilTest(); - clear_state.ApplyViewport(); + clear_state.AllDirty(); + clear_state.Apply(); if (use_color) { - glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); + glClearBufferfv(GL_COLOR, 0, regs.clear_color); } - if (clear_depth && clear_stencil) { + if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); - } else if (clear_depth) { + } else if (use_depth) { glClearBufferfv(GL_DEPTH, 0, ®s.clear_depth); - } else if (clear_stencil) { + } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } } @@ -650,9 +702,11 @@ void RasterizerOpenGL::DrawArrays() { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - ConfigureFramebuffers(state); + if (!gpu.ShouldExecute()) { + return; + } + SyncColorMask(); SyncFragmentColorClampState(); SyncMultiSampleState(); @@ -684,31 +738,102 @@ void RasterizerOpenGL::DrawArrays() { Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += - Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - const bool invalidate = buffer_cache.Map(buffer_size); - if (invalidate) { - // As all cached buffers are invalidated, we need to recheck their state. - gpu.dirty_flags.vertex_array.set(); - } + // Prepare the vertex array. + buffer_cache.Map(buffer_size); + // Prepare vertex array format. const GLuint vao = SetupVertexFormat(); + vertex_array_pushbuffer.Setup(vao); + + // Upload vertex and index data. SetupVertexBuffer(vao); + SetupVertexInstances(vao); + const GLintptr index_buffer_offset = SetupIndexBuffer(); + + // Setup draw parameters. It will automatically choose what glDraw* method to use. + const DrawParameters params = SetupDraw(index_buffer_offset); - DrawParameters params = SetupDraw(); + // Prepare packed bindings. + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + // Setup shaders and their used resources. + texture_cache.GuardSamplers(true); SetupShaders(params.primitive_mode); + texture_cache.GuardSamplers(false); - buffer_cache.Unmap(); + ConfigureFramebuffers(state); + + // Signal the buffer cache that we are not going to upload more things. + const bool invalidate = buffer_cache.Unmap(); + + // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. + vertex_array_pushbuffer.Bind(); + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + if (invalidate) { + // As all cached buffers are invalidated, we need to recheck their state. + gpu.dirty.ResetVertexArrays(); + } shader_program_manager->ApplyTo(state); state.Apply(); - res_cache.SignalPreDrawCall(); + if (texture_cache.TextureBarrier()) { + glTextureBarrier(); + } + params.DispatchDraw(); - res_cache.SignalPostDrawCall(); accelerate_draw = AccelDraw::Disabled; + gpu.dirty.memory_general = false; +} + +void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { + if (!GLAD_GL_ARB_compute_variable_group_size) { + LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " + "lack of GL_ARB_compute_variable_group_size"); + return; + } + + auto kernel = shader_cache.GetComputeKernel(code_addr); + ProgramVariant variant; + variant.texture_buffer_usage = SetupComputeTextures(kernel); + SetupComputeImages(kernel); + + const auto [program, next_bindings] = kernel->GetProgramHandle(variant); + state.draw.shader_program = program; + state.draw.program_pipeline = 0; + + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + SetupComputeConstBuffers(kernel); + SetupComputeGlobalMemory(kernel); + + buffer_cache.Unmap(); + + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + state.ApplyTextures(); + state.ApplyImages(); + state.ApplyShaderProgram(); + state.ApplyProgramPipeline(); + + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, + launch_desc.grid_dim_z, launch_desc.block_dim_x, + launch_desc.block_dim_y, launch_desc.block_dim_z); } void RasterizerOpenGL::FlushAll() {} @@ -718,8 +843,8 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { if (!addr || !size) { return; } - res_cache.FlushRegion(addr, size); - global_cache.FlushRegion(addr, size); + texture_cache.FlushRegion(addr, size); + buffer_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -727,23 +852,31 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { if (!addr || !size) { return; } - res_cache.InvalidateRegion(addr, size); + texture_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); - global_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { - FlushRegion(addr, size); + if (Settings::values.use_accurate_gpu_emulation) { + FlushRegion(addr, size); + } InvalidateRegion(addr, size); } +void RasterizerOpenGL::FlushCommands() { + glFlush(); +} + +void RasterizerOpenGL::TickFrame() { + buffer_cache.TickFrame(); +} + bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Common::Rectangle<u32>& src_rect, - const Common::Rectangle<u32>& dst_rect) { + const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); - res_cache.FermiCopySurface(src, dst, src_rect, dst_rect); + texture_cache.DoFermiCopy(src, dst, copy_config); return true; } @@ -755,7 +888,8 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, MICROPROFILE_SCOPE(OpenGL_CacheManagement); - const auto& surface{res_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))}; + const auto surface{ + texture_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))}; if (!surface) { return {}; } @@ -771,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, LOG_WARNING(Render_OpenGL, "Framebuffer pixel_format is different"); } - screen_info.display_texture = surface->Texture().handle; + screen_info.display_texture = surface->GetTexture(); return true; } @@ -779,14 +913,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; - const auto& entries = shader->GetShaderEntries().const_buffers; + const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& shader_stage = stages[static_cast<std::size_t>(stage)]; + for (const auto& entry : shader->GetShaderEntries().const_buffers) { + const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); + } +} - // Upload only the enabled buffers from the 16 constbuffers of each shader stage - for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry = entries[bindpoint]; - SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); +void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { + MICROPROFILE_SCOPE(OpenGL_UBO); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; + const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); + Tegra::Engines::ConstBufferInfo buffer; + buffer.address = config.Address(); + buffer.size = config.size; + buffer.enabled = mask[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); } } @@ -794,84 +939,169 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b const GLShader::ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(0, 0, 0); + bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); return; } - std::size_t size; - if (entry.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - size = buffer.size; - - if (size > MaxConstbufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size, - MaxConstbufferSize); - size = MaxConstbufferSize; - } - } else { - // Buffer is accessed directly, upload just what we use - size = entry.GetSize(); - } - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 // UBO alignment requirements. - size = Common::AlignUp(size, sizeof(GLvec4)); - ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big"); + const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const std::size_t alignment = device.GetUniformBufferAlignment(); - const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment); - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size); + const auto alignment = device.GetUniformBufferAlignment(); + const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); + bind_ubo_pushbuffer.Push(cbuf, offset, size); } -void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader) { - const auto& entries = shader->GetShaderEntries().global_memory_entries; - for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry{entries[bindpoint]}; - const auto& region{global_cache.GetGlobalRegion(entry, stage)}; - if (entry.IsWritten()) { - region->MarkAsModified(true, global_cache); - } - bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, - static_cast<GLsizeiptr>(region->GetSizeInBytes())); +void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; + for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); + } +} + +void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); } } -void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader, - BaseBindings base_bindings) { +void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, + GPUVAddr gpu_addr, std::size_t size) { + const auto alignment{device.GetShaderStorageBufferAlignment()}; + const auto [ssbo, buffer_offset] = + buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten()); + bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +} + +TextureBufferUsage RasterizerOpenGL::SetupDrawTextures(Maxwell::ShaderStage stage, + const Shader& shader, + BaseBindings base_bindings) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& gpu = system.GPU(); const auto& maxwell3d = gpu.Maxwell3D(); const auto& entries = shader->GetShaderEntries().samplers; - ASSERT_MSG(base_bindings.sampler + entries.size() <= std::size(state.texture_units), + ASSERT_MSG(base_bindings.sampler + entries.size() <= std::size(state.textures), "Exceeded the number of active textures."); + TextureBufferUsage texture_buffer_usage{0}; + for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& entry = entries[bindpoint]; - Tegra::Texture::FullTextureInfo texture; - if (entry.IsBindless()) { + const auto texture = [&]() { + if (!entry.IsBindless()) { + return maxwell3d.GetStageTexture(stage, entry.GetOffset()); + } const auto cbuf = entry.GetBindlessCBuf(); Tegra::Texture::TextureHandle tex_handle; tex_handle.raw = maxwell3d.AccessConstBuffer32(stage, cbuf.first, cbuf.second); - texture = maxwell3d.GetTextureInfo(tex_handle, entry.GetOffset()); - } else { - texture = maxwell3d.GetStageTexture(stage, entry.GetOffset()); + return maxwell3d.GetTextureInfo(tex_handle, entry.GetOffset()); + }(); + + if (SetupTexture(base_bindings.sampler + bindpoint, texture, entry)) { + texture_buffer_usage.set(bindpoint); } - const u32 current_bindpoint = base_bindings.sampler + bindpoint; + } - state.texture_units[current_bindpoint].sampler = sampler_cache.GetSampler(texture.tsc); + return texture_buffer_usage; +} - if (Surface surface = res_cache.GetTextureSurface(texture, entry); surface) { - state.texture_units[current_bindpoint].texture = - surface->Texture(entry.IsArray()).handle; - surface->UpdateSwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, - texture.tic.w_source); - } else { - // Can occur when texture addr is null or its memory is unmapped/invalid - state.texture_units[current_bindpoint].texture = 0; +TextureBufferUsage RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { + MICROPROFILE_SCOPE(OpenGL_Texture); + const auto& compute = system.GPU().KeplerCompute(); + const auto& entries = kernel->GetShaderEntries().samplers; + + ASSERT_MSG(entries.size() <= std::size(state.textures), + "Exceeded the number of active textures."); + + TextureBufferUsage texture_buffer_usage{0}; + + for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { + const auto& entry = entries[bindpoint]; + const auto texture = [&]() { + if (!entry.IsBindless()) { + return compute.GetTexture(entry.GetOffset()); + } + const auto cbuf = entry.GetBindlessCBuf(); + Tegra::Texture::TextureHandle tex_handle; + tex_handle.raw = compute.AccessConstBuffer32(cbuf.first, cbuf.second); + return compute.GetTextureInfo(tex_handle, entry.GetOffset()); + }(); + + if (SetupTexture(bindpoint, texture, entry)) { + texture_buffer_usage.set(bindpoint); } } + + return texture_buffer_usage; +} + +bool RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, + const GLShader::SamplerEntry& entry) { + state.samplers[binding] = sampler_cache.GetSampler(texture.tsc); + + const auto view = texture_cache.GetTextureSurface(texture.tic, entry); + if (!view) { + // Can occur when texture addr is null or its memory is unmapped/invalid + state.textures[binding] = 0; + return false; + } + state.textures[binding] = view->GetTexture(); + + if (view->GetSurfaceParams().IsBuffer()) { + return true; + } + + // Apply swizzle to textures that are not buffers. + view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, + texture.tic.w_source); + return false; +} + +void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { + const auto& compute = system.GPU().KeplerCompute(); + const auto& entries = shader->GetShaderEntries().images; + for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { + const auto& entry = entries[bindpoint]; + const auto tic = [&]() { + if (!entry.IsBindless()) { + return compute.GetTexture(entry.GetOffset()).tic; + } + const auto cbuf = entry.GetBindlessCBuf(); + Tegra::Texture::TextureHandle tex_handle; + tex_handle.raw = compute.AccessConstBuffer32(cbuf.first, cbuf.second); + return compute.GetTextureInfo(tex_handle, entry.GetOffset()).tic; + }(); + SetupImage(bindpoint, tic, entry); + } +} + +void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, + const GLShader::ImageEntry& entry) { + const auto view = texture_cache.GetImageSurface(tic, entry); + if (!view) { + state.images[binding] = 0; + return; + } + if (!tic.IsBuffer()) { + view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + } + if (entry.IsWritten()) { + view->MarkAsModified(texture_cache.Tick()); + } + state.images[binding] = view->GetTexture(); } void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { @@ -915,10 +1145,11 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); - state.cull.enabled = regs.cull.enabled != 0; + const auto& regs = maxwell3d.regs; + state.cull.enabled = regs.cull.enabled != 0; if (state.cull.enabled) { state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); @@ -951,15 +1182,23 @@ void RasterizerOpenGL::SyncDepthTestState() { state.depth.test_enabled = regs.depth_test_enable != 0; state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE; - if (!state.depth.test_enabled) + if (!state.depth.test_enabled) { return; + } state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); } void RasterizerOpenGL::SyncStencilTestState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.stencil_test) { + return; + } + maxwell3d.dirty.stencil_test = false; + + const auto& regs = maxwell3d.regs; state.stencil.test_enabled = regs.stencil_enable != 0; + state.MarkDirtyStencilState(); if (!regs.stencil_enable) { return; @@ -992,7 +1231,12 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncColorMask() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.color_mask) { + return; + } + const auto& regs = maxwell3d.regs; + const std::size_t count = regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; for (std::size_t i = 0; i < count; i++) { @@ -1003,6 +1247,9 @@ void RasterizerOpenGL::SyncColorMask() { dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; } + + state.MarkDirtyColorMask(); + maxwell3d.dirty.color_mask = false; } void RasterizerOpenGL::SyncMultiSampleState() { @@ -1017,7 +1264,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() { } void RasterizerOpenGL::SyncBlendState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.blend_state) { + return; + } + const auto& regs = maxwell3d.regs; state.blend_color.red = regs.blend_color.r; state.blend_color.green = regs.blend_color.g; @@ -1040,6 +1291,8 @@ void RasterizerOpenGL::SyncBlendState() { for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { state.blend[i].enabled = false; } + maxwell3d.dirty.blend_state = false; + state.MarkDirtyBlendState(); return; } @@ -1056,6 +1309,9 @@ void RasterizerOpenGL::SyncBlendState() { blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); } + + state.MarkDirtyBlendState(); + maxwell3d.dirty.blend_state = false; } void RasterizerOpenGL::SyncLogicOpState() { @@ -1107,13 +1363,21 @@ void RasterizerOpenGL::SyncPointState() { } void RasterizerOpenGL::SyncPolygonOffset() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.polygon_offset) { + return; + } + const auto& regs = maxwell3d.regs; + state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; state.polygon_offset.units = regs.polygon_offset_units; state.polygon_offset.factor = regs.polygon_offset_factor; state.polygon_offset.clamp = regs.polygon_offset_clamp; + + state.MarkDirtyPolygonOffset(); + maxwell3d.dirty.polygon_offset = false; } void RasterizerOpenGL::SyncAlphaTest() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f7671ff5d..eada752e0 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -23,15 +23,16 @@ #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_global_cache.h" -#include "video_core/renderer_opengl/gl_rasterizer_cache.h" +#include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/textures/texture.h" namespace Core { class System; @@ -41,11 +42,14 @@ namespace Core::Frontend { class EmuWindow; } +namespace Tegra { +class MemoryManager; +} + namespace OpenGL { struct ScreenInfo; struct DrawParameters; -struct FramebufferCacheKey; class RasterizerOpenGL : public VideoCore::RasterizerInterface { public: @@ -55,14 +59,16 @@ public: void DrawArrays() override; void Clear() override; + void DispatchCompute(GPUVAddr code_addr) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + void FlushCommands() override; + void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Common::Rectangle<u32>& src_rect, - const Common::Rectangle<u32>& dst_rect) override; + const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; bool AccelerateDrawBatch(bool is_indexed) override; @@ -70,11 +76,6 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - /// Maximum supported size that a constbuffer can have in bytes. - static constexpr std::size_t MaxConstbufferSize = 0x10000; - static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, - "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); - private: struct FramebufferConfigState { bool using_color_fb{}; @@ -95,32 +96,64 @@ private: /** * Configures the color and depth framebuffer states. - * @param use_color_fb If true, configure color framebuffers. - * @param using_depth_fb If true, configure the depth/stencil framebuffer. - * @param preserve_contents If true, tries to preserve data from a previously used framebuffer. + * + * @param current_state The current OpenGL state. + * @param using_color_fb If true, configure color framebuffers. + * @param using_depth_fb If true, configure the depth/stencil framebuffer. + * @param preserve_contents If true, tries to preserve data from a previously used + * framebuffer. * @param single_color_target Specifies if a single color buffer target should be used. + * * @returns If depth (first) or stencil (second) are being stored in the bound zeta texture - * (requires using_depth_fb to be true) + * (requires using_depth_fb to be true) */ std::pair<bool, bool> ConfigureFramebuffers( - OpenGLState& current_state, bool use_color_fb = true, bool using_depth_fb = true, + OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true, bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); + void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb); + /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader); + /// Configures the current constbuffers to use for the kernel invocation. + void SetupComputeConstBuffers(const Shader& kernel); + /// Configures a constant buffer. void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, const GLShader::ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. - void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader); + void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader); + + /// Configures the current global memory entries to use for the kernel invocation. + void SetupComputeGlobalMemory(const Shader& kernel); + + /// Configures a constant buffer. + void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + std::size_t size); + + /// Configures the current textures to use for the draw command. Returns shaders texture buffer + /// usage. + TextureBufferUsage SetupDrawTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader, BaseBindings base_bindings); + + /// Configures the textures used in a compute shader. Returns texture buffer usage. + TextureBufferUsage SetupComputeTextures(const Shader& kernel); + + /// Configures a texture. Returns true when the texture is a texture buffer. + bool SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, + const GLShader::SamplerEntry& entry); - /// Configures the current textures to use for the draw command. - void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader, - BaseBindings base_bindings); + /// Configures images in a compute shader. + void SetupComputeImages(const Shader& shader); + + /// Configures an image. + void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, + const GLShader::ImageEntry& entry); /// Syncs the viewport and depth range to match the guest state void SyncViewport(OpenGLState& current_state); @@ -181,10 +214,10 @@ private: const Device device; OpenGLState state; - RasterizerCacheOpenGL res_cache; + TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; - GlobalRegionCacheOpenGL global_cache; SamplerCacheOpenGL sampler_cache; + FramebufferCacheOpenGL framebuffer_cache; Core::System& system; ScreenInfo& screen_info; @@ -195,13 +228,13 @@ private: OGLVertexArray> vertex_array_cache; - std::map<FramebufferCacheKey, OGLFramebuffer> framebuffer_cache; FramebufferConfigState current_framebuffer_config_state; std::pair<bool, bool> current_depth_stencil_usage{}; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; + VertexArrayPushBuffer vertex_array_pushbuffer; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; @@ -213,16 +246,19 @@ private: GLuint SetupVertexFormat(); void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); - DrawParameters SetupDraw(); + GLintptr SetupIndexBuffer(); - void SetupShaders(GLenum primitive_mode); + DrawParameters SetupDraw(GLintptr index_buffer_offset); - void SetupCachedFramebuffer(const FramebufferCacheKey& fbkey, OpenGLState& current_state); + void SetupShaders(GLenum primitive_mode); enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; + OGLFramebuffer clear_framebuffer; + using CachedPageMap = boost::icl::interval_map<u64, int>; CachedPageMap cached_pages; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp deleted file mode 100644 index a7681902e..000000000 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ /dev/null @@ -1,1362 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include <glad/glad.h> - -#include "common/alignment.h" -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "common/scope_exit.h" -#include "core/core.h" -#include "core/hle/kernel/process.h" -#include "core/settings.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/morton.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_rasterizer_cache.h" -#include "video_core/renderer_opengl/utils.h" -#include "video_core/surface.h" -#include "video_core/textures/convert.h" -#include "video_core/textures/decoders.h" - -namespace OpenGL { - -using VideoCore::MortonSwizzle; -using VideoCore::MortonSwizzleMode; -using VideoCore::Surface::ComponentTypeFromDepthFormat; -using VideoCore::Surface::ComponentTypeFromRenderTarget; -using VideoCore::Surface::ComponentTypeFromTexture; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; -using VideoCore::Surface::PixelFormatFromTextureFormat; -using VideoCore::Surface::SurfaceTargetFromTextureType; - -struct FormatTuple { - GLint internal_format; - GLenum format; - GLenum type; - ComponentType component_type; - bool compressed; -}; - -static void ApplyTextureDefaults(GLuint texture, u32 max_mip_level) { - glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, max_mip_level - 1); - if (max_mip_level == 1) { - glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0); - } -} - -void SurfaceParams::InitCacheParameters(GPUVAddr gpu_addr_) { - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - - gpu_addr = gpu_addr_; - host_ptr = memory_manager.GetPointer(gpu_addr_); - size_in_bytes = SizeInBytesRaw(); - - if (IsPixelFormatASTC(pixel_format)) { - // ASTC is uncompressed in software, in emulated as RGBA8 - size_in_bytes_gl = width * height * depth * 4; - } else { - size_in_bytes_gl = SizeInBytesGL(); - } -} - -std::size_t SurfaceParams::InnerMipmapMemorySize(u32 mip_level, bool force_gl, bool layer_only, - bool uncompressed) const { - const u32 tile_x{GetDefaultBlockWidth(pixel_format)}; - const u32 tile_y{GetDefaultBlockHeight(pixel_format)}; - const u32 bytes_per_pixel{GetBytesPerPixel(pixel_format)}; - u32 m_depth = (layer_only ? 1U : depth); - u32 m_width = MipWidth(mip_level); - u32 m_height = MipHeight(mip_level); - m_width = uncompressed ? m_width : std::max(1U, (m_width + tile_x - 1) / tile_x); - m_height = uncompressed ? m_height : std::max(1U, (m_height + tile_y - 1) / tile_y); - m_depth = std::max(1U, m_depth >> mip_level); - u32 m_block_height = MipBlockHeight(mip_level); - u32 m_block_depth = MipBlockDepth(mip_level); - return Tegra::Texture::CalculateSize(force_gl ? false : is_tiled, bytes_per_pixel, m_width, - m_height, m_depth, m_block_height, m_block_depth); -} - -std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only, - bool uncompressed) const { - std::size_t block_size_bytes = Tegra::Texture::GetGOBSize() * block_height * block_depth; - std::size_t size = 0; - for (u32 i = 0; i < max_mip_level; i++) { - size += InnerMipmapMemorySize(i, force_gl, layer_only, uncompressed); - } - if (!force_gl && is_tiled) { - size = Common::AlignUp(size, block_size_bytes); - } - return size; -} - -/*static*/ SurfaceParams SurfaceParams::CreateForTexture( - const Tegra::Texture::FullTextureInfo& config, const GLShader::SamplerEntry& entry) { - SurfaceParams params{}; - params.is_tiled = config.tic.IsTiled(); - params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0, - params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0, - params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0, - params.tile_width_spacing = params.is_tiled ? (1 << config.tic.tile_width_spacing.Value()) : 1; - params.srgb_conversion = config.tic.IsSrgbConversionEnabled(); - params.pixel_format = PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value(), - params.srgb_conversion); - - if (config.tsc.depth_compare_enabled) { - // Some titles create a 'R16U' (normalized 16-bit) texture with depth_compare enabled, - // then attempt to sample from it via a shadow sampler. Convert format to Z16 (which also - // causes GetFormatType to properly return 'Depth' below). - if (GetFormatType(params.pixel_format) == SurfaceType::ColorTexture) { - switch (params.pixel_format) { - case PixelFormat::R16S: - case PixelFormat::R16U: - case PixelFormat::R16F: - params.pixel_format = PixelFormat::Z16; - break; - case PixelFormat::R32F: - params.pixel_format = PixelFormat::Z32F; - break; - default: - LOG_WARNING(HW_GPU, "Color texture format being used with depth compare: {}", - static_cast<u32>(params.pixel_format)); - break; - } - } - } - - params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value()); - params.type = GetFormatType(params.pixel_format); - UNIMPLEMENTED_IF(params.type == SurfaceType::ColorTexture && config.tsc.depth_compare_enabled); - - params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format)); - params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format)); - if (!params.is_tiled) { - params.pitch = config.tic.Pitch(); - } - params.unaligned_height = config.tic.Height(); - params.target = SurfaceTargetFromTextureType(config.tic.texture_type); - params.identity = SurfaceClass::Uploaded; - - switch (params.target) { - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture2D: - params.depth = 1; - break; - case SurfaceTarget::TextureCubemap: - params.depth = config.tic.Depth() * 6; - break; - case SurfaceTarget::Texture3D: - params.depth = config.tic.Depth(); - break; - case SurfaceTarget::Texture2DArray: - params.depth = config.tic.Depth(); - if (!entry.IsArray()) { - // TODO(bunnei): We have seen games re-use a Texture2D as Texture2DArray with depth of - // one, but sample the texture in the shader as if it were not an array texture. This - // probably is valid on hardware, but we still need to write a test to confirm this. In - // emulation, the workaround here is to continue to treat this as a Texture2D. An - // example game that does this is Super Mario Odyssey (in Cloud Kingdom). - ASSERT(params.depth == 1); - params.target = SurfaceTarget::Texture2D; - } - break; - case SurfaceTarget::TextureCubeArray: - params.depth = config.tic.Depth() * 6; - if (!entry.IsArray()) { - ASSERT(params.depth == 6); - params.target = SurfaceTarget::TextureCubemap; - } - break; - default: - LOG_CRITICAL(HW_GPU, "Unknown depth for target={}", static_cast<u32>(params.target)); - UNREACHABLE(); - params.depth = 1; - break; - } - - params.is_layered = SurfaceTargetIsLayered(params.target); - params.is_array = SurfaceTargetIsArray(params.target); - params.max_mip_level = config.tic.max_mip_level + 1; - params.rt = {}; - - params.InitCacheParameters(config.tic.Address()); - - return params; -} - -/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(std::size_t index) { - const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]}; - SurfaceParams params{}; - - params.is_tiled = - config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.block_width = 1 << config.memory_layout.block_width; - params.block_height = 1 << config.memory_layout.block_height; - params.block_depth = 1 << config.memory_layout.block_depth; - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; - params.component_type = ComponentTypeFromRenderTarget(config.format); - params.type = GetFormatType(params.pixel_format); - if (params.is_tiled) { - params.width = config.width; - } else { - params.pitch = config.width; - const u32 bpp = params.GetFormatBpp() / 8; - params.width = params.pitch / bpp; - } - params.height = config.height; - params.unaligned_height = config.height; - params.target = SurfaceTarget::Texture2D; - params.identity = SurfaceClass::RenderTarget; - params.depth = 1; - params.max_mip_level = 1; - params.is_layered = false; - - // Render target specific parameters, not used for caching - params.rt.index = static_cast<u32>(index); - params.rt.array_mode = config.array_mode; - params.rt.layer_stride = config.layer_stride; - params.rt.volume = config.volume; - params.rt.base_layer = config.base_layer; - - params.InitCacheParameters(config.Address()); - - return params; -} - -/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer( - u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) { - SurfaceParams params{}; - - params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.block_width = 1 << std::min(block_width, 5U); - params.block_height = 1 << std::min(block_height, 5U); - params.block_depth = 1 << std::min(block_depth, 5U); - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromDepthFormat(format); - params.component_type = ComponentTypeFromDepthFormat(format); - params.type = GetFormatType(params.pixel_format); - params.srgb_conversion = false; - params.width = zeta_width; - params.height = zeta_height; - params.unaligned_height = zeta_height; - params.target = SurfaceTarget::Texture2D; - params.identity = SurfaceClass::DepthBuffer; - params.depth = 1; - params.max_mip_level = 1; - params.is_layered = false; - params.rt = {}; - - params.InitCacheParameters(zeta_address); - - return params; -} - -/*static*/ SurfaceParams SurfaceParams::CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config) { - SurfaceParams params{}; - - params.is_tiled = !config.linear; - params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0, - params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0, - params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 32U) : 0, - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; - params.component_type = ComponentTypeFromRenderTarget(config.format); - params.type = GetFormatType(params.pixel_format); - params.width = config.width; - params.pitch = config.pitch; - params.height = config.height; - params.unaligned_height = config.height; - params.target = SurfaceTarget::Texture2D; - params.identity = SurfaceClass::Copy; - params.depth = 1; - params.max_mip_level = 1; - params.rt = {}; - - params.InitCacheParameters(config.Address()); - - return params; -} - -static constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U - {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false}, // ABGR8S - {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // ABGR8UI - {GL_RGB8, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false}, // B5G6R5U - {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm, - false}, // A2B10G10R10U - {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U - {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // R8U - {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // R8UI - {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false}, // RGBA16F - {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // RGBA16U - {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RGBA16UI - {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float, - false}, // R11FG11FB10F - {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI - {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT1 - {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT23 - {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT45 - {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN1 - {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXN2UNORM - {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM - {GL_COMPRESSED_RGBA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // BC7U - {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, - true}, // BC6H_UF16 - {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, - true}, // BC6H_SF16 - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4 - {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // BGRA8 - {GL_RGBA32F, GL_RGBA, GL_FLOAT, ComponentType::Float, false}, // RGBA32F - {GL_RG32F, GL_RG, GL_FLOAT, ComponentType::Float, false}, // RG32F - {GL_R32F, GL_RED, GL_FLOAT, ComponentType::Float, false}, // R32F - {GL_R16F, GL_RED, GL_HALF_FLOAT, ComponentType::Float, false}, // R16F - {GL_R16, GL_RED, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // R16U - {GL_R16_SNORM, GL_RED, GL_SHORT, ComponentType::SNorm, false}, // R16S - {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // R16UI - {GL_R16I, GL_RED_INTEGER, GL_SHORT, ComponentType::SInt, false}, // R16I - {GL_RG16, GL_RG, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // RG16 - {GL_RG16F, GL_RG, GL_HALF_FLOAT, ComponentType::Float, false}, // RG16F - {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RG16UI - {GL_RG16I, GL_RG_INTEGER, GL_SHORT, ComponentType::SInt, false}, // RG16I - {GL_RG16_SNORM, GL_RG, GL_SHORT, ComponentType::SNorm, false}, // RG16S - {GL_RGB32F, GL_RGB, GL_FLOAT, ComponentType::Float, false}, // RGB32F - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, - false}, // RGBA8_SRGB - {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // RG8U - {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false}, // RG8S - {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RG32UI - {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // R32UI - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8 - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5 - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4 - {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // BGRA8 - // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT1_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT23_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // DXT45_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, - true}, // BC7U_SRGB - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4_SRGB - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8_SRGB - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5_SRGB - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4_SRGB - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5 - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5_SRGB - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8 - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8_SRGB - - // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, ComponentType::UNorm, - false}, // Z16 - - // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm, - false}, // Z24S8 - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm, - false}, // S8Z24 - {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV, - ComponentType::Float, false}, // Z32FS8 -}}; - -static GLenum SurfaceTargetToGL(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - return GL_TEXTURE_1D; - case SurfaceTarget::Texture2D: - return GL_TEXTURE_2D; - case SurfaceTarget::Texture3D: - return GL_TEXTURE_3D; - case SurfaceTarget::Texture1DArray: - return GL_TEXTURE_1D_ARRAY; - case SurfaceTarget::Texture2DArray: - return GL_TEXTURE_2D_ARRAY; - case SurfaceTarget::TextureCubemap: - return GL_TEXTURE_CUBE_MAP; - case SurfaceTarget::TextureCubeArray: - return GL_TEXTURE_CUBE_MAP_ARRAY; - } - LOG_CRITICAL(Render_OpenGL, "Unimplemented texture target={}", static_cast<u32>(target)); - UNREACHABLE(); - return {}; -} - -static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { - ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); - auto& format = tex_format_tuples[static_cast<unsigned int>(pixel_format)]; - ASSERT(component_type == format.component_type); - - return format; -} - -/// Returns the discrepant array target -constexpr GLenum GetArrayDiscrepantTarget(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - return GL_TEXTURE_1D_ARRAY; - case SurfaceTarget::Texture2D: - return GL_TEXTURE_2D_ARRAY; - case SurfaceTarget::Texture3D: - return GL_NONE; - case SurfaceTarget::Texture1DArray: - return GL_TEXTURE_1D; - case SurfaceTarget::Texture2DArray: - return GL_TEXTURE_2D; - case SurfaceTarget::TextureCubemap: - return GL_TEXTURE_CUBE_MAP_ARRAY; - case SurfaceTarget::TextureCubeArray: - return GL_TEXTURE_CUBE_MAP; - } - return GL_NONE; -} - -Common::Rectangle<u32> SurfaceParams::GetRect(u32 mip_level) const { - u32 actual_height{std::max(1U, unaligned_height >> mip_level)}; - if (IsPixelFormatASTC(pixel_format)) { - // ASTC formats must stop at the ATSC block size boundary - actual_height = Common::AlignDown(actual_height, GetASTCBlockSize(pixel_format).second); - } - return {0, actual_height, MipWidth(mip_level), 0}; -} - -void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params, - std::vector<u8>& gl_buffer, u32 mip_level) { - u32 depth = params.MipDepth(mip_level); - if (params.target == SurfaceTarget::Texture2D) { - // TODO(Blinkhawk): Eliminate this condition once all texture types are implemented. - depth = 1U; - } - if (params.is_layered) { - u64 offset = params.GetMipmapLevelOffset(mip_level); - u64 offset_gl = 0; - const u64 layer_size = params.LayerMemorySize(); - const u64 gl_size = params.LayerSizeGL(mip_level); - for (u32 i = 0; i < params.depth; i++) { - MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level), - params.MipBlockHeight(mip_level), params.MipHeight(mip_level), - params.MipBlockDepth(mip_level), 1, params.tile_width_spacing, - gl_buffer.data() + offset_gl, params.host_ptr + offset); - offset += layer_size; - offset_gl += gl_size; - } - } else { - const u64 offset = params.GetMipmapLevelOffset(mip_level); - MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level), - params.MipBlockHeight(mip_level), params.MipHeight(mip_level), - params.MipBlockDepth(mip_level), depth, params.tile_width_spacing, - gl_buffer.data(), params.host_ptr + offset); - } -} - -void RasterizerCacheOpenGL::FastCopySurface(const Surface& src_surface, - const Surface& dst_surface) { - const auto& src_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - - const u32 width{std::min(src_params.width, dst_params.width)}; - const u32 height{std::min(src_params.height, dst_params.height)}; - - glCopyImageSubData(src_surface->Texture().handle, SurfaceTargetToGL(src_params.target), 0, 0, 0, - 0, dst_surface->Texture().handle, SurfaceTargetToGL(dst_params.target), 0, 0, - 0, 0, width, height, 1); - - dst_surface->MarkAsModified(true, *this); -} - -MICROPROFILE_DEFINE(OpenGL_CopySurface, "OpenGL", "CopySurface", MP_RGB(128, 192, 64)); -void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surface& dst_surface, - const GLuint copy_pbo_handle, const GLenum src_attachment, - const GLenum dst_attachment, - const std::size_t cubemap_face) { - MICROPROFILE_SCOPE(OpenGL_CopySurface); - ASSERT_MSG(dst_attachment == 0, "Unimplemented"); - - const auto& src_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - - const auto source_format = GetFormatTuple(src_params.pixel_format, src_params.component_type); - const auto dest_format = GetFormatTuple(dst_params.pixel_format, dst_params.component_type); - - const std::size_t buffer_size = std::max(src_params.size_in_bytes, dst_params.size_in_bytes); - - glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle); - glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_COPY); - if (source_format.compressed) { - glGetCompressedTextureImage(src_surface->Texture().handle, src_attachment, - static_cast<GLsizei>(src_params.size_in_bytes), nullptr); - } else { - glGetTextureImage(src_surface->Texture().handle, src_attachment, source_format.format, - source_format.type, static_cast<GLsizei>(src_params.size_in_bytes), - nullptr); - } - // If the new texture is bigger than the previous one, we need to fill in the rest with data - // from the CPU. - if (src_params.size_in_bytes < dst_params.size_in_bytes) { - // Upload the rest of the memory. - if (dst_params.is_tiled) { - // TODO(Subv): We might have to de-tile the subtexture and re-tile it with the rest - // of the data in this case. Games like Super Mario Odyssey seem to hit this case - // when drawing, it re-uses the memory of a previous texture as a bigger framebuffer - // but it doesn't clear it beforehand, the texture is already full of zeros. - LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during " - "reinterpretation but the texture is tiled."); - } - const std::size_t remaining_size = dst_params.size_in_bytes - src_params.size_in_bytes; - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - glBufferSubData(GL_PIXEL_PACK_BUFFER, src_params.size_in_bytes, remaining_size, - memory_manager.GetPointer(dst_params.gpu_addr + src_params.size_in_bytes)); - } - - glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); - - const GLsizei width{static_cast<GLsizei>( - std::min(src_params.GetRect().GetWidth(), dst_params.GetRect().GetWidth()))}; - const GLsizei height{static_cast<GLsizei>( - std::min(src_params.GetRect().GetHeight(), dst_params.GetRect().GetHeight()))}; - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo_handle); - if (dest_format.compressed) { - LOG_CRITICAL(HW_GPU, "Compressed copy is unimplemented!"); - UNREACHABLE(); - } else { - switch (dst_params.target) { - case SurfaceTarget::Texture1D: - glTextureSubImage1D(dst_surface->Texture().handle, 0, 0, width, dest_format.format, - dest_format.type, nullptr); - break; - case SurfaceTarget::Texture2D: - glTextureSubImage2D(dst_surface->Texture().handle, 0, 0, 0, width, height, - dest_format.format, dest_format.type, nullptr); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureSubImage3D(dst_surface->Texture().handle, 0, 0, 0, 0, width, height, - static_cast<GLsizei>(dst_params.depth), dest_format.format, - dest_format.type, nullptr); - break; - case SurfaceTarget::TextureCubemap: - glTextureSubImage3D(dst_surface->Texture().handle, 0, 0, 0, - static_cast<GLint>(cubemap_face), width, height, 1, - dest_format.format, dest_format.type, nullptr); - break; - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(dst_params.target)); - UNREACHABLE(); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - } - - dst_surface->MarkAsModified(true, *this); -} - -CachedSurface::CachedSurface(const SurfaceParams& params) - : RasterizerCacheObject{params.host_ptr}, params{params}, - gl_target{SurfaceTargetToGL(params.target)}, cached_size_in_bytes{params.size_in_bytes} { - - const auto optional_cpu_addr{ - Core::System::GetInstance().GPU().MemoryManager().GpuToCpuAddress(params.gpu_addr)}; - ASSERT_MSG(optional_cpu_addr, "optional_cpu_addr is invalid"); - cpu_addr = *optional_cpu_addr; - - texture.Create(gl_target); - - // TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0) - // alternatives. This signals a bug on those functions. - const auto width = static_cast<GLsizei>(params.MipWidth(0)); - const auto height = static_cast<GLsizei>(params.MipHeight(0)); - memory_size = params.MemorySize(); - reinterpreted = false; - - const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type); - gl_internal_format = format_tuple.internal_format; - - switch (params.target) { - case SurfaceTarget::Texture1D: - glTextureStorage1D(texture.handle, params.max_mip_level, format_tuple.internal_format, - width); - break; - case SurfaceTarget::Texture2D: - case SurfaceTarget::TextureCubemap: - glTextureStorage2D(texture.handle, params.max_mip_level, format_tuple.internal_format, - width, height); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureStorage3D(texture.handle, params.max_mip_level, format_tuple.internal_format, - width, height, params.depth); - break; - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(params.target)); - UNREACHABLE(); - glTextureStorage2D(texture.handle, params.max_mip_level, format_tuple.internal_format, - width, height); - } - - ApplyTextureDefaults(texture.handle, params.max_mip_level); - - OpenGL::LabelGLObject(GL_TEXTURE, texture.handle, params.gpu_addr, params.IdentityString()); -} - -MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64)); -void CachedSurface::LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) { - MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); - auto& gl_buffer = res_cache_tmp_mem.gl_buffer; - if (gl_buffer.size() < params.max_mip_level) - gl_buffer.resize(params.max_mip_level); - for (u32 i = 0; i < params.max_mip_level; i++) - gl_buffer[i].resize(params.GetMipmapSizeGL(i)); - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", - params.block_width, static_cast<u32>(params.target)); - for (u32 i = 0; i < params.max_mip_level; i++) - SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); - } else { - const u32 bpp = params.GetFormatBpp() / 8; - const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) / - GetDefaultBlockWidth(params.pixel_format); - if (params.pitch == copy_size) { - std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl); - } else { - const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) / - GetDefaultBlockHeight(params.pixel_format); - const u8* start{params.host_ptr}; - u8* write_to = gl_buffer[0].data(); - for (u32 h = height; h > 0; h--) { - std::memcpy(write_to, start, copy_size); - start += params.pitch; - write_to += copy_size; - } - } - } - for (u32 i = 0; i < params.max_mip_level; i++) { - const u32 width = params.MipWidth(i); - const u32 height = params.MipHeight(i); - const u32 depth = params.MipDepth(i); - if (VideoCore::Surface::IsPixelFormatASTC(params.pixel_format)) { - // Reserve size for RGBA8 conversion - constexpr std::size_t rgba_bpp = 4; - gl_buffer[i].resize(std::max(gl_buffer[i].size(), width * height * depth * rgba_bpp)); - } - Tegra::Texture::ConvertFromGuestToHost(gl_buffer[i].data(), params.pixel_format, width, - height, depth, true, true); - } -} - -MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); -void CachedSurface::FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) { - MICROPROFILE_SCOPE(OpenGL_SurfaceFlush); - - ASSERT_MSG(!IsPixelFormatASTC(params.pixel_format), "Unimplemented"); - - auto& gl_buffer = res_cache_tmp_mem.gl_buffer; - // OpenGL temporary buffer needs to be big enough to store raw texture size - gl_buffer[0].resize(GetSizeInBytes()); - - const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - const u32 align = std::clamp(params.RowAlign(0), 1U, 8U); - glPixelStorei(GL_PACK_ALIGNMENT, align); - glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width)); - ASSERT(!tuple.compressed); - glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); - glGetTextureImage(texture.handle, 0, tuple.format, tuple.type, - static_cast<GLsizei>(gl_buffer[0].size()), gl_buffer[0].data()); - glPixelStorei(GL_PACK_ROW_LENGTH, 0); - Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width, - params.height, params.depth, true, true); - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", - params.block_width, static_cast<u32>(params.target)); - - SwizzleFunc(MortonSwizzleMode::LinearToMorton, params, gl_buffer[0], 0); - } else { - const u32 bpp = params.GetFormatBpp() / 8; - const u32 copy_size = params.width * bpp; - if (params.pitch == copy_size) { - std::memcpy(params.host_ptr, gl_buffer[0].data(), GetSizeInBytes()); - } else { - u8* start{params.host_ptr}; - const u8* read_to = gl_buffer[0].data(); - for (u32 h = params.height; h > 0; h--) { - std::memcpy(start, read_to, copy_size); - start += params.pitch; - read_to += copy_size; - } - } - } -} - -void CachedSurface::UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map, - GLuint read_fb_handle, GLuint draw_fb_handle) { - const auto& rect{params.GetRect(mip_map)}; - - auto& gl_buffer = res_cache_tmp_mem.gl_buffer; - - // Load data from memory to the surface - const auto x0 = static_cast<GLint>(rect.left); - const auto y0 = static_cast<GLint>(rect.bottom); - auto buffer_offset = - static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.MipWidth(mip_map) + - static_cast<std::size_t>(x0)) * - GetBytesPerPixel(params.pixel_format); - - const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - - const u32 align = std::clamp(params.RowAlign(mip_map), 1U, 8U); - glPixelStorei(GL_UNPACK_ALIGNMENT, align); - glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.MipWidth(mip_map))); - - const auto image_size = static_cast<GLsizei>(params.GetMipmapSizeGL(mip_map, false)); - if (tuple.compressed) { - switch (params.target) { - case SurfaceTarget::Texture2D: - glCompressedTextureSubImage2D( - texture.handle, mip_map, 0, 0, static_cast<GLsizei>(params.MipWidth(mip_map)), - static_cast<GLsizei>(params.MipHeight(mip_map)), tuple.internal_format, image_size, - &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::Texture3D: - glCompressedTextureSubImage3D( - texture.handle, mip_map, 0, 0, 0, static_cast<GLsizei>(params.MipWidth(mip_map)), - static_cast<GLsizei>(params.MipHeight(mip_map)), - static_cast<GLsizei>(params.MipDepth(mip_map)), tuple.internal_format, image_size, - &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glCompressedTextureSubImage3D( - texture.handle, mip_map, 0, 0, 0, static_cast<GLsizei>(params.MipWidth(mip_map)), - static_cast<GLsizei>(params.MipHeight(mip_map)), static_cast<GLsizei>(params.depth), - tuple.internal_format, image_size, &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::TextureCubemap: { - const auto layer_size = static_cast<GLsizei>(params.LayerSizeGL(mip_map)); - for (std::size_t face = 0; face < params.depth; ++face) { - glCompressedTextureSubImage3D( - texture.handle, mip_map, 0, 0, static_cast<GLint>(face), - static_cast<GLsizei>(params.MipWidth(mip_map)), - static_cast<GLsizei>(params.MipHeight(mip_map)), 1, tuple.internal_format, - layer_size, &gl_buffer[mip_map][buffer_offset]); - buffer_offset += layer_size; - } - break; - } - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(params.target)); - UNREACHABLE(); - glCompressedTextureSubImage2D( - texture.handle, mip_map, 0, 0, static_cast<GLsizei>(params.MipWidth(mip_map)), - static_cast<GLsizei>(params.MipHeight(mip_map)), tuple.internal_format, - static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[mip_map][buffer_offset]); - } - } else { - switch (params.target) { - case SurfaceTarget::Texture1D: - glTextureSubImage1D(texture.handle, mip_map, x0, static_cast<GLsizei>(rect.GetWidth()), - tuple.format, tuple.type, &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::Texture2D: - glTextureSubImage2D(texture.handle, mip_map, x0, y0, - static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::Texture3D: - glTextureSubImage3D(texture.handle, mip_map, x0, y0, 0, - static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), params.MipDepth(mip_map), - tuple.format, tuple.type, &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureSubImage3D(texture.handle, mip_map, x0, y0, 0, - static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), params.depth, tuple.format, - tuple.type, &gl_buffer[mip_map][buffer_offset]); - break; - case SurfaceTarget::TextureCubemap: { - for (std::size_t face = 0; face < params.depth; ++face) { - glTextureSubImage3D(texture.handle, mip_map, x0, y0, static_cast<GLint>(face), - static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), 1, tuple.format, - tuple.type, &gl_buffer[mip_map][buffer_offset]); - buffer_offset += params.LayerSizeGL(mip_map); - } - break; - } - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(params.target)); - UNREACHABLE(); - glTextureSubImage2D(texture.handle, mip_map, x0, y0, - static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[mip_map][buffer_offset]); - } - } - - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); -} - -void CachedSurface::EnsureTextureDiscrepantView() { - if (discrepant_view.handle != 0) - return; - - const GLenum target{GetArrayDiscrepantTarget(params.target)}; - ASSERT(target != GL_NONE); - - const GLuint num_layers{target == GL_TEXTURE_CUBE_MAP_ARRAY ? 6u : 1u}; - constexpr GLuint min_layer = 0; - constexpr GLuint min_level = 0; - - glGenTextures(1, &discrepant_view.handle); - glTextureView(discrepant_view.handle, target, texture.handle, gl_internal_format, min_level, - params.max_mip_level, min_layer, num_layers); - ApplyTextureDefaults(discrepant_view.handle, params.max_mip_level); - glTextureParameteriv(discrepant_view.handle, GL_TEXTURE_SWIZZLE_RGBA, - reinterpret_cast<const GLint*>(swizzle.data())); -} - -MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 192, 64)); -void CachedSurface::UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, - GLuint read_fb_handle, GLuint draw_fb_handle) { - MICROPROFILE_SCOPE(OpenGL_TextureUL); - - for (u32 i = 0; i < params.max_mip_level; i++) - UploadGLMipmapTexture(res_cache_tmp_mem, i, read_fb_handle, draw_fb_handle); -} - -void CachedSurface::UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x, - Tegra::Texture::SwizzleSource swizzle_y, - Tegra::Texture::SwizzleSource swizzle_z, - Tegra::Texture::SwizzleSource swizzle_w) { - const GLenum new_x = MaxwellToGL::SwizzleSource(swizzle_x); - const GLenum new_y = MaxwellToGL::SwizzleSource(swizzle_y); - const GLenum new_z = MaxwellToGL::SwizzleSource(swizzle_z); - const GLenum new_w = MaxwellToGL::SwizzleSource(swizzle_w); - if (swizzle[0] == new_x && swizzle[1] == new_y && swizzle[2] == new_z && swizzle[3] == new_w) { - return; - } - swizzle = {new_x, new_y, new_z, new_w}; - const auto swizzle_data = reinterpret_cast<const GLint*>(swizzle.data()); - glTextureParameteriv(texture.handle, GL_TEXTURE_SWIZZLE_RGBA, swizzle_data); - if (discrepant_view.handle != 0) { - glTextureParameteriv(discrepant_view.handle, GL_TEXTURE_SWIZZLE_RGBA, swizzle_data); - } -} - -RasterizerCacheOpenGL::RasterizerCacheOpenGL(RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer} { - read_framebuffer.Create(); - draw_framebuffer.Create(); - copy_pbo.Create(); -} - -Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextureInfo& config, - const GLShader::SamplerEntry& entry) { - return GetSurface(SurfaceParams::CreateForTexture(config, entry)); -} - -Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) { - auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()}; - const auto& regs{gpu.regs}; - - if (!gpu.dirty_flags.zeta_buffer) { - return last_depth_buffer; - } - gpu.dirty_flags.zeta_buffer = false; - - if (!regs.zeta.Address() || !regs.zeta_enable) { - return last_depth_buffer = {}; - } - - SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer( - regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format, - regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height, - regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)}; - - return last_depth_buffer = GetSurface(depth_params, preserve_contents); -} - -Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool preserve_contents) { - auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()}; - const auto& regs{gpu.regs}; - - if (!gpu.dirty_flags.color_buffer[index]) { - return current_color_buffers[index]; - } - gpu.dirty_flags.color_buffer.reset(index); - - ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - - if (index >= regs.rt_control.count) { - return current_color_buffers[index] = {}; - } - - if (regs.rt[index].Address() == 0 || regs.rt[index].format == Tegra::RenderTargetFormat::NONE) { - return current_color_buffers[index] = {}; - } - - const SurfaceParams color_params{SurfaceParams::CreateForFramebuffer(index)}; - - return current_color_buffers[index] = GetSurface(color_params, preserve_contents); -} - -void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { - surface->LoadGLBuffer(temporal_memory); - surface->UploadGLTexture(temporal_memory, read_framebuffer.handle, draw_framebuffer.handle); - surface->MarkAsModified(false, *this); - surface->MarkForReload(false); -} - -Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) { - if (!params.IsValid()) { - return {}; - } - - // Look up surface in the cache based on address - Surface surface{TryGet(params.host_ptr)}; - if (surface) { - if (surface->GetSurfaceParams().IsCompatibleSurface(params)) { - // Use the cached surface as-is unless it's not synced with memory - if (surface->MustReload()) - LoadSurface(surface); - return surface; - } else if (preserve_contents) { - // If surface parameters changed and we care about keeping the previous data, recreate - // the surface from the old one - Surface new_surface{RecreateSurface(surface, params)}; - Unregister(surface); - Register(new_surface); - if (new_surface->IsUploaded()) { - RegisterReinterpretSurface(new_surface); - } - return new_surface; - } else { - // Delete the old surface before creating a new one to prevent collisions. - Unregister(surface); - } - } - - // No cached surface found - get a new one - surface = GetUncachedSurface(params); - Register(surface); - - // Only load surface from memory if we care about the contents - if (preserve_contents) { - LoadSurface(surface); - } - - return surface; -} - -Surface RasterizerCacheOpenGL::GetUncachedSurface(const SurfaceParams& params) { - Surface surface{TryGetReservedSurface(params)}; - if (!surface) { - // No reserved surface available, create a new one and reserve it - surface = std::make_shared<CachedSurface>(params); - ReserveSurface(surface); - } - return surface; -} - -void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface, - const Surface& dst_surface) { - const auto& init_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - GPUVAddr address{init_params.gpu_addr}; - const std::size_t layer_size{dst_params.LayerMemorySize()}; - for (u32 layer = 0; layer < dst_params.depth; layer++) { - for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) { - const GPUVAddr sub_address{address + dst_params.GetMipmapLevelOffset(mipmap)}; - const Surface& copy{TryGet(memory_manager.GetPointer(sub_address))}; - if (!copy) { - continue; - } - const auto& src_params{copy->GetSurfaceParams()}; - const u32 width{std::min(src_params.width, dst_params.MipWidth(mipmap))}; - const u32 height{std::min(src_params.height, dst_params.MipHeight(mipmap))}; - - glCopyImageSubData(copy->Texture().handle, SurfaceTargetToGL(src_params.target), 0, 0, - 0, 0, dst_surface->Texture().handle, - SurfaceTargetToGL(dst_params.target), mipmap, 0, 0, layer, width, - height, 1); - } - address += layer_size; - } - - dst_surface->MarkAsModified(true, *this); -} - -static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface, - const Common::Rectangle<u32>& src_rect, - const Common::Rectangle<u32>& dst_rect, GLuint read_fb_handle, - GLuint draw_fb_handle, GLenum src_attachment = 0, GLenum dst_attachment = 0, - std::size_t cubemap_face = 0) { - - const auto& src_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ prev_state.Apply(); }); - - OpenGLState state; - state.draw.read_framebuffer = read_fb_handle; - state.draw.draw_framebuffer = draw_fb_handle; - state.Apply(); - - u32 buffers{}; - - if (src_params.type == SurfaceType::ColorTexture) { - switch (src_params.target) { - case SurfaceTarget::Texture2D: - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - case SurfaceTarget::TextureCubemap: - glFramebufferTexture2D( - GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), - src_surface->Texture().handle, 0); - glFramebufferTexture2D( - GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0); - break; - case SurfaceTarget::Texture2DArray: - glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - src_surface->Texture().handle, 0, 0); - glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0); - break; - case SurfaceTarget::Texture3D: - glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - SurfaceTargetToGL(src_params.target), - src_surface->Texture().handle, 0, 0); - glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - SurfaceTargetToGL(src_params.target), 0, 0, 0); - break; - default: - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - } - - switch (dst_params.target) { - case SurfaceTarget::Texture2D: - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - case SurfaceTarget::TextureCubemap: - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), - dst_surface->Texture().handle, 0); - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0); - break; - case SurfaceTarget::Texture2DArray: - glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - dst_surface->Texture().handle, 0, 0); - glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0); - break; - - case SurfaceTarget::Texture3D: - glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - SurfaceTargetToGL(dst_params.target), - dst_surface->Texture().handle, 0, 0); - glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - SurfaceTargetToGL(dst_params.target), 0, 0, 0); - break; - default: - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - } - - buffers = GL_COLOR_BUFFER_BIT; - } else if (src_params.type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - - buffers = GL_DEPTH_BUFFER_BIT; - } else if (src_params.type == SurfaceType::DepthStencil) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - src_surface->Texture().handle, 0); - - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - dst_surface->Texture().handle, 0); - - buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; - } - - glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left, - dst_rect.top, dst_rect.right, dst_rect.bottom, buffers, - buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST); - - return true; -} - -void RasterizerCacheOpenGL::FermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& src_config, - const Tegra::Engines::Fermi2D::Regs::Surface& dst_config, - const Common::Rectangle<u32>& src_rect, const Common::Rectangle<u32>& dst_rect) { - - const auto& src_params = SurfaceParams::CreateForFermiCopySurface(src_config); - const auto& dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config); - - ASSERT(src_params.pixel_format == dst_params.pixel_format); - ASSERT(src_params.block_height == dst_params.block_height); - ASSERT(src_params.is_tiled == dst_params.is_tiled); - ASSERT(src_params.depth == dst_params.depth); - ASSERT(src_params.target == dst_params.target); - ASSERT(src_params.rt.index == dst_params.rt.index); - - auto src_surface = GetSurface(src_params, true); - auto dst_surface = GetSurface(dst_params, true); - - BlitSurface(src_surface, dst_surface, src_rect, dst_rect, read_framebuffer.handle, - draw_framebuffer.handle); - - dst_surface->MarkAsModified(true, *this); -} - -void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface, - const Surface& dst_surface) { - const auto& src_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - - // Flush enough memory for both the source and destination surface - FlushRegion(ToCacheAddr(src_params.host_ptr), - std::max(src_params.MemorySize(), dst_params.MemorySize())); - - LoadSurface(dst_surface); -} - -Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, - const SurfaceParams& new_params) { - // Verify surface is compatible for blitting - auto old_params{old_surface->GetSurfaceParams()}; - - // Get a new surface with the new parameters, and blit the previous surface to it - Surface new_surface{GetUncachedSurface(new_params)}; - - // With use_accurate_gpu_emulation enabled, do an accurate surface copy - if (Settings::values.use_accurate_gpu_emulation) { - AccurateCopySurface(old_surface, new_surface); - return new_surface; - } - - const bool old_compressed = - GetFormatTuple(old_params.pixel_format, old_params.component_type).compressed; - const bool new_compressed = - GetFormatTuple(new_params.pixel_format, new_params.component_type).compressed; - const bool compatible_formats = - GetFormatBpp(old_params.pixel_format) == GetFormatBpp(new_params.pixel_format) && - !(old_compressed || new_compressed); - // For compatible surfaces, we can just do fast glCopyImageSubData based copy - if (old_params.target == new_params.target && old_params.depth == new_params.depth && - old_params.depth == 1 && compatible_formats) { - FastCopySurface(old_surface, new_surface); - return new_surface; - } - - switch (new_params.target) { - case SurfaceTarget::Texture2D: - CopySurface(old_surface, new_surface, copy_pbo.handle); - break; - case SurfaceTarget::Texture3D: - AccurateCopySurface(old_surface, new_surface); - break; - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - if (compatible_formats) - FastLayeredCopySurface(old_surface, new_surface); - else { - AccurateCopySurface(old_surface, new_surface); - } - break; - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(new_params.target)); - UNREACHABLE(); - } - - return new_surface; -} - -Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(const u8* host_ptr) const { - return TryGet(host_ptr); -} - -void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) { - const auto& surface_reserve_key{SurfaceReserveKey::Create(surface->GetSurfaceParams())}; - surface_reserve[surface_reserve_key] = surface; -} - -Surface RasterizerCacheOpenGL::TryGetReservedSurface(const SurfaceParams& params) { - const auto& surface_reserve_key{SurfaceReserveKey::Create(params)}; - auto search{surface_reserve.find(surface_reserve_key)}; - if (search != surface_reserve.end()) { - return search->second; - } - return {}; -} - -static std::optional<u32> TryFindBestMipMap(std::size_t memory, const SurfaceParams params, - u32 height) { - for (u32 i = 0; i < params.max_mip_level; i++) { - if (memory == params.GetMipmapSingleSize(i) && params.MipHeight(i) == height) { - return {i}; - } - } - return {}; -} - -static std::optional<u32> TryFindBestLayer(GPUVAddr addr, const SurfaceParams params, u32 mipmap) { - const std::size_t size{params.LayerMemorySize()}; - GPUVAddr start{params.gpu_addr + params.GetMipmapLevelOffset(mipmap)}; - for (u32 i = 0; i < params.depth; i++) { - if (start == addr) { - return {i}; - } - start += size; - } - return {}; -} - -static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surface render_surface, - const Surface blitted_surface) { - const auto& dst_params = blitted_surface->GetSurfaceParams(); - const auto& src_params = render_surface->GetSurfaceParams(); - const std::size_t src_memory_size = src_params.size_in_bytes; - const std::optional<u32> level = - TryFindBestMipMap(src_memory_size, dst_params, src_params.height); - if (level.has_value()) { - if (src_params.width == dst_params.MipWidthGobAligned(*level) && - src_params.height == dst_params.MipHeight(*level) && - src_params.block_height >= dst_params.MipBlockHeight(*level)) { - const std::optional<u32> slot = - TryFindBestLayer(render_surface->GetSurfaceParams().gpu_addr, dst_params, *level); - if (slot.has_value()) { - glCopyImageSubData(render_surface->Texture().handle, - SurfaceTargetToGL(src_params.target), 0, 0, 0, 0, - blitted_surface->Texture().handle, - SurfaceTargetToGL(dst_params.target), *level, 0, 0, *slot, - dst_params.MipWidth(*level), dst_params.MipHeight(*level), 1); - blitted_surface->MarkAsModified(true, cache); - return true; - } - } - } - return false; -} - -static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) { - const VAddr bound1 = blitted_surface->GetCpuAddr() + blitted_surface->GetMemorySize(); - const VAddr bound2 = render_surface->GetCpuAddr() + render_surface->GetMemorySize(); - if (bound2 > bound1) - return true; - const auto& dst_params = blitted_surface->GetSurfaceParams(); - const auto& src_params = render_surface->GetSurfaceParams(); - return (dst_params.component_type != src_params.component_type); -} - -static bool IsReinterpretInvalidSecond(const Surface render_surface, - const Surface blitted_surface) { - const auto& dst_params = blitted_surface->GetSurfaceParams(); - const auto& src_params = render_surface->GetSurfaceParams(); - return (dst_params.height > src_params.height && dst_params.width > src_params.width); -} - -bool RasterizerCacheOpenGL::PartialReinterpretSurface(Surface triggering_surface, - Surface intersect) { - if (IsReinterpretInvalid(triggering_surface, intersect)) { - Unregister(intersect); - return false; - } - if (!LayerFitReinterpretSurface(*this, triggering_surface, intersect)) { - if (IsReinterpretInvalidSecond(triggering_surface, intersect)) { - Unregister(intersect); - return false; - } - FlushObject(intersect); - FlushObject(triggering_surface); - intersect->MarkForReload(true); - } - return true; -} - -void RasterizerCacheOpenGL::SignalPreDrawCall() { - if (texception && GLAD_GL_ARB_texture_barrier) { - glTextureBarrier(); - } - texception = false; -} - -void RasterizerCacheOpenGL::SignalPostDrawCall() { - for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) { - if (current_color_buffers[i] != nullptr) { - Surface intersect = - CollideOnReinterpretedSurface(current_color_buffers[i]->GetCacheAddr()); - if (intersect != nullptr) { - PartialReinterpretSurface(current_color_buffers[i], intersect); - texception = true; - } - } - } -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h deleted file mode 100644 index 6263ef3e7..000000000 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ /dev/null @@ -1,572 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <memory> -#include <string> -#include <tuple> -#include <vector> - -#include "common/alignment.h" -#include "common/bit_util.h" -#include "common/common_types.h" -#include "common/hash.h" -#include "common/math_util.h" -#include "video_core/engines/fermi_2d.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/surface.h" -#include "video_core/textures/decoders.h" -#include "video_core/textures/texture.h" - -namespace OpenGL { - -class CachedSurface; -using Surface = std::shared_ptr<CachedSurface>; -using SurfaceSurfaceRect_Tuple = std::tuple<Surface, Surface, Common::Rectangle<u32>>; - -using SurfaceTarget = VideoCore::Surface::SurfaceTarget; -using SurfaceType = VideoCore::Surface::SurfaceType; -using PixelFormat = VideoCore::Surface::PixelFormat; -using ComponentType = VideoCore::Surface::ComponentType; -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - -struct SurfaceParams { - enum class SurfaceClass { - Uploaded, - RenderTarget, - DepthBuffer, - Copy, - }; - - static std::string SurfaceTargetName(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - return "Texture1D"; - case SurfaceTarget::Texture2D: - return "Texture2D"; - case SurfaceTarget::Texture3D: - return "Texture3D"; - case SurfaceTarget::Texture1DArray: - return "Texture1DArray"; - case SurfaceTarget::Texture2DArray: - return "Texture2DArray"; - case SurfaceTarget::TextureCubemap: - return "TextureCubemap"; - case SurfaceTarget::TextureCubeArray: - return "TextureCubeArray"; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); - UNREACHABLE(); - return fmt::format("TextureUnknown({})", static_cast<u32>(target)); - } - } - - u32 GetFormatBpp() const { - return VideoCore::Surface::GetFormatBpp(pixel_format); - } - - /// Returns the rectangle corresponding to this surface - Common::Rectangle<u32> GetRect(u32 mip_level = 0) const; - - /// Returns the total size of this surface in bytes, adjusted for compression - std::size_t SizeInBytesRaw(bool ignore_tiled = false) const { - const u32 compression_factor{GetCompressionFactor(pixel_format)}; - const u32 bytes_per_pixel{GetBytesPerPixel(pixel_format)}; - const size_t uncompressed_size{ - Tegra::Texture::CalculateSize((ignore_tiled ? false : is_tiled), bytes_per_pixel, width, - height, depth, block_height, block_depth)}; - - // Divide by compression_factor^2, as height and width are factored by this - return uncompressed_size / (compression_factor * compression_factor); - } - - /// Returns the size of this surface as an OpenGL texture in bytes - std::size_t SizeInBytesGL() const { - return SizeInBytesRaw(true); - } - - /// Returns the size of this surface as a cube face in bytes - std::size_t SizeInBytesCubeFace() const { - return size_in_bytes / 6; - } - - /// Returns the size of this surface as an OpenGL cube face in bytes - std::size_t SizeInBytesCubeFaceGL() const { - return size_in_bytes_gl / 6; - } - - /// Returns the exact size of memory occupied by the texture in VRAM, including mipmaps. - std::size_t MemorySize() const { - std::size_t size = InnerMemorySize(false, is_layered); - if (is_layered) - return size * depth; - return size; - } - - /// Returns true if the parameters constitute a valid rasterizer surface. - bool IsValid() const { - return gpu_addr && host_ptr && height && width; - } - - /// Returns the exact size of the memory occupied by a layer in a texture in VRAM, including - /// mipmaps. - std::size_t LayerMemorySize() const { - return InnerMemorySize(false, true); - } - - /// Returns the size of a layer of this surface in OpenGL. - std::size_t LayerSizeGL(u32 mip_level) const { - return InnerMipmapMemorySize(mip_level, true, is_layered, false); - } - - std::size_t GetMipmapSizeGL(u32 mip_level, bool ignore_compressed = true) const { - std::size_t size = InnerMipmapMemorySize(mip_level, true, is_layered, ignore_compressed); - if (is_layered) - return size * depth; - return size; - } - - std::size_t GetMipmapLevelOffset(u32 mip_level) const { - std::size_t offset = 0; - for (u32 i = 0; i < mip_level; i++) - offset += InnerMipmapMemorySize(i, false, is_layered); - return offset; - } - - std::size_t GetMipmapLevelOffsetGL(u32 mip_level) const { - std::size_t offset = 0; - for (u32 i = 0; i < mip_level; i++) - offset += InnerMipmapMemorySize(i, true, is_layered); - return offset; - } - - std::size_t GetMipmapSingleSize(u32 mip_level) const { - return InnerMipmapMemorySize(mip_level, false, is_layered); - } - - u32 MipWidth(u32 mip_level) const { - return std::max(1U, width >> mip_level); - } - - u32 MipWidthGobAligned(u32 mip_level) const { - return Common::AlignUp(std::max(1U, width >> mip_level), 64U * 8U / GetFormatBpp()); - } - - u32 MipHeight(u32 mip_level) const { - return std::max(1U, height >> mip_level); - } - - u32 MipDepth(u32 mip_level) const { - return is_layered ? depth : std::max(1U, depth >> mip_level); - } - - // Auto block resizing algorithm from: - // https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_miptree.c - u32 MipBlockHeight(u32 mip_level) const { - if (mip_level == 0) - return block_height; - u32 alt_height = MipHeight(mip_level); - u32 h = GetDefaultBlockHeight(pixel_format); - u32 blocks_in_y = (alt_height + h - 1) / h; - u32 bh = 16; - while (bh > 1 && blocks_in_y <= bh * 4) { - bh >>= 1; - } - return bh; - } - - u32 MipBlockDepth(u32 mip_level) const { - if (mip_level == 0) { - return block_depth; - } - - if (is_layered) { - return 1; - } - - const u32 mip_depth = MipDepth(mip_level); - u32 bd = 32; - while (bd > 1 && mip_depth * 2 <= bd) { - bd >>= 1; - } - - if (bd == 32) { - const u32 bh = MipBlockHeight(mip_level); - if (bh >= 4) { - return 16; - } - } - - return bd; - } - - u32 RowAlign(u32 mip_level) const { - const u32 m_width = MipWidth(mip_level); - const u32 bytes_per_pixel = GetBytesPerPixel(pixel_format); - const u32 l2 = Common::CountTrailingZeroes32(m_width * bytes_per_pixel); - return (1U << l2); - } - - /// Creates SurfaceParams from a texture configuration - static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config, - const GLShader::SamplerEntry& entry); - - /// Creates SurfaceParams from a framebuffer configuration - static SurfaceParams CreateForFramebuffer(std::size_t index); - - /// Creates SurfaceParams for a depth buffer configuration - static SurfaceParams CreateForDepthBuffer( - u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type); - - /// Creates SurfaceParams for a Fermi2D surface copy - static SurfaceParams CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config); - - /// Checks if surfaces are compatible for caching - bool IsCompatibleSurface(const SurfaceParams& other) const { - if (std::tie(pixel_format, type, width, height, target, depth, is_tiled) == - std::tie(other.pixel_format, other.type, other.width, other.height, other.target, - other.depth, other.is_tiled)) { - if (!is_tiled) - return true; - return std::tie(block_height, block_depth, tile_width_spacing) == - std::tie(other.block_height, other.block_depth, other.tile_width_spacing); - } - return false; - } - - /// Initializes parameters for caching, should be called after everything has been initialized - void InitCacheParameters(GPUVAddr gpu_addr); - - std::string TargetName() const { - switch (target) { - case SurfaceTarget::Texture1D: - return "1D"; - case SurfaceTarget::Texture2D: - return "2D"; - case SurfaceTarget::Texture3D: - return "3D"; - case SurfaceTarget::Texture1DArray: - return "1DArray"; - case SurfaceTarget::Texture2DArray: - return "2DArray"; - case SurfaceTarget::TextureCubemap: - return "Cube"; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); - UNREACHABLE(); - return fmt::format("TUK({})", static_cast<u32>(target)); - } - } - - std::string ClassName() const { - switch (identity) { - case SurfaceClass::Uploaded: - return "UP"; - case SurfaceClass::RenderTarget: - return "RT"; - case SurfaceClass::DepthBuffer: - return "DB"; - case SurfaceClass::Copy: - return "CP"; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_class={}", static_cast<u32>(identity)); - UNREACHABLE(); - return fmt::format("CUK({})", static_cast<u32>(identity)); - } - } - - std::string IdentityString() const { - return ClassName() + '_' + TargetName() + '_' + (is_tiled ? 'T' : 'L'); - } - - bool is_tiled; - u32 block_width; - u32 block_height; - u32 block_depth; - u32 tile_width_spacing; - PixelFormat pixel_format; - ComponentType component_type; - SurfaceType type; - u32 width; - u32 height; - u32 depth; - u32 unaligned_height; - u32 pitch; - SurfaceTarget target; - SurfaceClass identity; - u32 max_mip_level; - bool is_layered; - bool is_array; - bool srgb_conversion; - // Parameters used for caching - u8* host_ptr; - GPUVAddr gpu_addr; - std::size_t size_in_bytes; - std::size_t size_in_bytes_gl; - - // Render target specific parameters, not used in caching - struct { - u32 index; - u32 array_mode; - u32 volume; - u32 layer_stride; - u32 base_layer; - } rt; - -private: - std::size_t InnerMipmapMemorySize(u32 mip_level, bool force_gl = false, bool layer_only = false, - bool uncompressed = false) const; - std::size_t InnerMemorySize(bool force_gl = false, bool layer_only = false, - bool uncompressed = false) const; -}; - -}; // namespace OpenGL - -/// Hashable variation of SurfaceParams, used for a key in the surface cache -struct SurfaceReserveKey : Common::HashableStruct<OpenGL::SurfaceParams> { - static SurfaceReserveKey Create(const OpenGL::SurfaceParams& params) { - SurfaceReserveKey res; - res.state = params; - res.state.identity = {}; // Ignore the origin of the texture - res.state.gpu_addr = {}; // Ignore GPU vaddr in caching - res.state.rt = {}; // Ignore rt config in caching - return res; - } -}; -namespace std { -template <> -struct hash<SurfaceReserveKey> { - std::size_t operator()(const SurfaceReserveKey& k) const { - return k.Hash(); - } -}; -} // namespace std - -namespace OpenGL { - -class RasterizerOpenGL; - -// This is used to store temporary big buffers, -// instead of creating/destroying all the time -struct RasterizerTemporaryMemory { - std::vector<std::vector<u8>> gl_buffer; -}; - -class CachedSurface final : public RasterizerCacheObject { -public: - explicit CachedSurface(const SurfaceParams& params); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return cached_size_in_bytes; - } - - std::size_t GetMemorySize() const { - return memory_size; - } - - const OGLTexture& Texture() const { - return texture; - } - - const OGLTexture& Texture(bool as_array) { - if (params.is_array == as_array) { - return texture; - } else { - EnsureTextureDiscrepantView(); - return discrepant_view; - } - } - - GLenum Target() const { - return gl_target; - } - - const SurfaceParams& GetSurfaceParams() const { - return params; - } - - // Read/Write data in Switch memory to/from gl_buffer - void LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem); - void FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem); - - // Upload data in gl_buffer to this surface's texture - void UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, GLuint read_fb_handle, - GLuint draw_fb_handle); - - void UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x, - Tegra::Texture::SwizzleSource swizzle_y, - Tegra::Texture::SwizzleSource swizzle_z, - Tegra::Texture::SwizzleSource swizzle_w); - - void MarkReinterpreted() { - reinterpreted = true; - } - - bool IsReinterpreted() const { - return reinterpreted; - } - - void MarkForReload(bool reload) { - must_reload = reload; - } - - bool MustReload() const { - return must_reload; - } - - bool IsUploaded() const { - return params.identity == SurfaceParams::SurfaceClass::Uploaded; - } - -private: - void UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map, - GLuint read_fb_handle, GLuint draw_fb_handle); - - void EnsureTextureDiscrepantView(); - - OGLTexture texture; - OGLTexture discrepant_view; - SurfaceParams params{}; - GLenum gl_target{}; - GLenum gl_internal_format{}; - std::size_t cached_size_in_bytes{}; - std::array<GLenum, 4> swizzle{GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA}; - std::size_t memory_size; - bool reinterpreted = false; - bool must_reload = false; - VAddr cpu_addr{}; -}; - -class RasterizerCacheOpenGL final : public RasterizerCache<Surface> { -public: - explicit RasterizerCacheOpenGL(RasterizerOpenGL& rasterizer); - - /// Get a surface based on the texture configuration - Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config, - const GLShader::SamplerEntry& entry); - - /// Get the depth surface based on the framebuffer configuration - Surface GetDepthBufferSurface(bool preserve_contents); - - /// Get the color surface based on the framebuffer configuration and the specified render target - Surface GetColorBufferSurface(std::size_t index, bool preserve_contents); - - /// Tries to find a framebuffer using on the provided CPU address - Surface TryFindFramebufferSurface(const u8* host_ptr) const; - - /// Copies the contents of one surface to another - void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config, - const Tegra::Engines::Fermi2D::Regs::Surface& dst_config, - const Common::Rectangle<u32>& src_rect, - const Common::Rectangle<u32>& dst_rect); - - void SignalPreDrawCall(); - void SignalPostDrawCall(); - -protected: - void FlushObjectInner(const Surface& object) override { - object->FlushGLBuffer(temporal_memory); - } - -private: - void LoadSurface(const Surface& surface); - Surface GetSurface(const SurfaceParams& params, bool preserve_contents = true); - - /// Gets an uncached surface, creating it if need be - Surface GetUncachedSurface(const SurfaceParams& params); - - /// Recreates a surface with new parameters - Surface RecreateSurface(const Surface& old_surface, const SurfaceParams& new_params); - - /// Reserves a unique surface that can be reused later - void ReserveSurface(const Surface& surface); - - /// Tries to get a reserved surface for the specified parameters - Surface TryGetReservedSurface(const SurfaceParams& params); - - // Partialy reinterpret a surface based on a triggering_surface that collides with it. - // returns true if the reinterpret was successful, false in case it was not. - bool PartialReinterpretSurface(Surface triggering_surface, Surface intersect); - - /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data - void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface); - void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface); - void FastCopySurface(const Surface& src_surface, const Surface& dst_surface); - void CopySurface(const Surface& src_surface, const Surface& dst_surface, - const GLuint copy_pbo_handle, const GLenum src_attachment = 0, - const GLenum dst_attachment = 0, const std::size_t cubemap_face = 0); - - /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have - /// previously been used. This is to prevent surfaces from being constantly created and - /// destroyed when used with different surface parameters. - std::unordered_map<SurfaceReserveKey, Surface> surface_reserve; - - OGLFramebuffer read_framebuffer; - OGLFramebuffer draw_framebuffer; - - bool texception = false; - - /// Use a Pixel Buffer Object to download the previous texture and then upload it to the new one - /// using the new format. - OGLBuffer copy_pbo; - - std::array<Surface, Maxwell::NumRenderTargets> last_color_buffers; - std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers; - Surface last_depth_buffer; - - RasterizerTemporaryMemory temporal_memory; - - using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>; - using SurfaceInterval = typename SurfaceIntervalCache::interval_type; - - static auto GetReinterpretInterval(const Surface& object) { - return SurfaceInterval::right_open(object->GetCacheAddr() + 1, - object->GetCacheAddr() + object->GetMemorySize() - 1); - } - - // Reinterpreted surfaces are very fragil as the game may keep rendering into them. - SurfaceIntervalCache reinterpreted_surfaces; - - void RegisterReinterpretSurface(Surface reinterpret_surface) { - auto interval = GetReinterpretInterval(reinterpret_surface); - reinterpreted_surfaces.insert({interval, reinterpret_surface}); - reinterpret_surface->MarkReinterpreted(); - } - - Surface CollideOnReinterpretedSurface(CacheAddr addr) const { - const SurfaceInterval interval{addr}; - for (auto& pair : - boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) { - return pair.second; - } - return nullptr; - } - - void Register(const Surface& object) override { - RasterizerCache<Surface>::Register(object); - } - - /// Unregisters an object from the cache - void Unregister(const Surface& object) override { - if (object->IsReinterpreted()) { - auto interval = GetReinterpretInterval(object); - reinterpreted_surfaces.erase(interval); - } - RasterizerCache<Surface>::Unregister(object); - } -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index bfe666a73..5c96c1d46 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -33,6 +33,24 @@ void OGLTexture::Release() { handle = 0; } +void OGLTextureView::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenTextures(1, &handle); +} + +void OGLTextureView::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteTextures(1, &handle); + OpenGLState::GetCurState().UnbindTexture(handle).Apply(); + handle = 0; +} + void OGLSampler::Create() { if (handle != 0) return; @@ -130,6 +148,12 @@ void OGLBuffer::Release() { handle = 0; } +void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) { + ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; }); + + glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY); +} + void OGLSync::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index fbb93ee49..3a85a1d4c 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -36,6 +36,31 @@ public: GLuint handle = 0; }; +class OGLTextureView : private NonCopyable { +public: + OGLTextureView() = default; + + OGLTextureView(OGLTextureView&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLTextureView() { + Release(); + } + + OGLTextureView& operator=(OGLTextureView&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Creates a new internal OpenGL resource and stores the handle + void Create(); + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLSampler : private NonCopyable { public: OGLSampler() = default; @@ -161,6 +186,9 @@ public: /// Deletes the internal OpenGL resource void Release(); + // Converts the buffer into a stream copy buffer with a fixed size + void MakeStreamCopy(std::size_t buffer_size); + GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h index defbc2d81..34ee37f00 100644 --- a/src/video_core/renderer_opengl/gl_sampler_cache.h +++ b/src/video_core/renderer_opengl/gl_sampler_cache.h @@ -17,9 +17,9 @@ public: ~SamplerCacheOpenGL(); protected: - OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const; + OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - GLuint ToSamplerType(const OGLSampler& sampler) const; + GLuint ToSamplerType(const OGLSampler& sampler) const override; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index ac8a9e6b7..909ccb82c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -23,13 +23,13 @@ namespace OpenGL { using VideoCommon::Shader::ProgramCode; -// One UBO is always reserved for emulation values -constexpr u32 RESERVED_UBOS = 1; +// One UBO is always reserved for emulation values on staged shaders +constexpr u32 STAGE_RESERVED_UBOS = 1; struct UnspecializedShader { std::string code; GLShader::ShaderEntries entries; - Maxwell::ShaderProgram program_type; + ProgramType program_type; }; namespace { @@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g } /// Gets the shader type from a Maxwell program type -constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) { +constexpr GLenum GetShaderType(ProgramType program_type) { switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GL_VERTEX_SHADER; - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GL_GEOMETRY_SHADER; - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GL_FRAGMENT_SHADER; + case ProgramType::Compute: + return GL_COMPUTE_SHADER; default: return GL_NONE; } @@ -100,18 +102,44 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen } } +ProgramType GetProgramType(Maxwell::ShaderProgram program) { + switch (program) { + case Maxwell::ShaderProgram::VertexA: + return ProgramType::VertexA; + case Maxwell::ShaderProgram::VertexB: + return ProgramType::VertexB; + case Maxwell::ShaderProgram::TesselationControl: + return ProgramType::TessellationControl; + case Maxwell::ShaderProgram::TesselationEval: + return ProgramType::TessellationEval; + case Maxwell::ShaderProgram::Geometry: + return ProgramType::Geometry; + case Maxwell::ShaderProgram::Fragment: + return ProgramType::Fragment; + } + UNREACHABLE(); + return {}; +} + /// Calculates the size of a program stream std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { constexpr std::size_t start_offset = 10; + // This is the encoded version of BRA that jumps to itself. All Nvidia + // shaders end with one. + constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; + constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; std::size_t offset = start_offset; std::size_t size = start_offset * sizeof(u64); while (offset < program.size()) { const u64 instruction = program[offset]; if (!IsSchedInstruction(offset, start_offset)) { - if (instruction == 0 || (instruction >> 52) == 0x50b) { + if ((instruction & mask) == self_jumping_branch) { // End on Maxwell's "nop" instruction break; } + if (instruction == 0) { + break; + } } size += sizeof(u64); offset++; @@ -121,11 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { } /// Hashes one (or two) program streams -u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code, - const ProgramCode& code_b) { - u64 unique_identifier = - Common::CityHash64(reinterpret_cast<const char*>(code.data()), CalculateProgramSize(code)); - if (program_type != Maxwell::ShaderProgram::VertexA) { +u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code, + const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) { + if (size_a == 0) { + size_a = CalculateProgramSize(code); + } + u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a); + if (program_type != ProgramType::VertexA) { return unique_identifier; } // VertexA programs include two programs @@ -133,46 +163,69 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& std::size_t seed = 0; boost::hash_combine(seed, unique_identifier); - const u64 identifier_b = Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), - CalculateProgramSize(code_b)); + if (size_b == 0) { + size_b = CalculateProgramSize(code_b); + } + const u64 identifier_b = + Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b); boost::hash_combine(seed, identifier_b); return static_cast<u64>(seed); } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, +GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type, ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); - if (program_type == Maxwell::ShaderProgram::VertexA) { + setup.program.size_a = CalculateProgramSize(program_code); + setup.program.size_b = 0; + if (program_type == ProgramType::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. // Conventional HW does not support this, so we combine VertexA and VertexB into one // stage here. setup.SetProgramB(program_code_b); + setup.program.size_b = CalculateProgramSize(program_code_b); } - setup.program.unique_identifier = - GetUniqueIdentifier(program_type, program_code, program_code_b); + setup.program.unique_identifier = GetUniqueIdentifier( + program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b); switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GLShader::GenerateVertexShader(device, setup); - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GLShader::GenerateGeometryShader(device, setup); - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GLShader::GenerateFragmentShader(device, setup); + case ProgramType::Compute: + return GLShader::GenerateComputeShader(device, setup); default: - LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type)); return {}; } } CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries, - Maxwell::ShaderProgram program_type, BaseBindings base_bindings, - GLenum primitive_mode, bool hint_retrievable = false) { + ProgramType program_type, const ProgramVariant& variant, + bool hint_retrievable = false) { + auto base_bindings{variant.base_bindings}; + const auto primitive_mode{variant.primitive_mode}; + const auto texture_buffer_usage{variant.texture_buffer_usage}; + std::string source = "#version 430 core\n" - "#extension GL_ARB_separate_shader_objects : enable\n\n"; - source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + "#extension GL_ARB_separate_shader_objects : enable\n" + "#extension GL_NV_gpu_shader5 : enable\n" + "#extension GL_NV_shader_thread_group : enable\n"; + if (entries.shader_viewport_layer_array) { + source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; + } + if (program_type == ProgramType::Compute) { + source += "#extension GL_ARB_compute_variable_group_size : require\n"; + } + source += '\n'; + + if (program_type != ProgramType::Compute) { + source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + } for (const auto& cbuf : entries.const_buffers) { source += @@ -186,15 +239,34 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn source += fmt::format("#define SAMPLER_BINDING_{} {}\n", sampler.GetIndex(), base_bindings.sampler++); } + for (const auto& image : entries.images) { + source += + fmt::format("#define IMAGE_BINDING_{} {}\n", image.GetIndex(), base_bindings.image++); + } + + // Transform 1D textures to texture samplers by declaring its preprocessor macros. + for (std::size_t i = 0; i < texture_buffer_usage.size(); ++i) { + if (!texture_buffer_usage.test(i)) { + continue; + } + source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i); + } + if (texture_buffer_usage.any()) { + source += '\n'; + } - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { const auto [glsl_topology, debug_name, max_vertices] = GetPrimitiveDescription(primitive_mode); - source += "layout (" + std::string(glsl_topology) + ") in;\n"; + source += "layout (" + std::string(glsl_topology) + ") in;\n\n"; source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; } + if (program_type == ProgramType::Compute) { + source += "layout (local_size_variable) in;\n"; + } + source += '\n'; source += code; OGLShader shader; @@ -221,131 +293,97 @@ std::set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, - Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, - const PrecompiledPrograms& precompiled_programs, - ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, - unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, - precompiled_programs{precompiled_programs} { - const std::size_t code_size{CalculateProgramSize(program_code)}; - const std::size_t code_size_b{program_code_b.empty() ? 0 - : CalculateProgramSize(program_code_b)}; - GLShader::ProgramResult program_result{ - CreateProgram(device, program_type, program_code, program_code_b)}; - if (program_result.first.empty()) { +CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, + GLShader::ProgramResult result) + : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr}, + unique_identifier{params.unique_identifier}, program_type{program_type}, + disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs}, + entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {} + +Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode&& program_code, + ProgramCode&& program_code_b) { + const auto code_size{CalculateProgramSize(program_code)}; + const auto code_size_b{CalculateProgramSize(program_code_b)}; + auto result{ + CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)}; + if (result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now - return; + return {}; } - code = program_result.first; - entries = program_result.second; - shader_length = entries.shader_length; + params.disk_cache.SaveRaw(ShaderDiskCacheRaw( + params.unique_identifier, GetProgramType(program_type), + static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)), + std::move(program_code), std::move(program_code_b))); - const ShaderDiskCacheRaw raw(unique_identifier, program_type, - static_cast<u32>(code_size / sizeof(u64)), - static_cast<u32>(code_size_b / sizeof(u64)), - std::move(program_code), std::move(program_code_b)); - disk_cache.SaveRaw(raw); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); } -CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, - Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, - const PrecompiledPrograms& precompiled_programs, - GLShader::ProgramResult result, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, - program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ - precompiled_programs} { - code = std::move(result.first); - entries = result.second; - shader_length = entries.shader_length; +Shader CachedShader::CreateStageFromCache(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + GLShader::ProgramResult result) { + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); } -std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive_mode, - BaseBindings base_bindings) { - GLuint handle{}; - if (program_type == Maxwell::ShaderProgram::Geometry) { - handle = GetGeometryShader(primitive_mode, base_bindings); - } else { - const auto [entry, is_cache_miss] = programs.try_emplace(base_bindings); - auto& program = entry->second; - if (is_cache_miss) { - program = TryLoadProgram(primitive_mode, base_bindings); - if (!program) { - program = - SpecializeShader(code, entries, program_type, base_bindings, primitive_mode); - disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings)); - } - - LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); - } +Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) { + auto result{CreateProgram(params.device, ProgramType::Compute, code, {})}; - handle = program->handle; - } + const auto code_size{CalculateProgramSize(code)}; + params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute, + static_cast<u32>(code_size / sizeof(u64)), 0, + std::move(code), {})); - base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS; - base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); - base_bindings.sampler += static_cast<u32>(entries.samplers.size()); + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); +} - return {handle, base_bindings}; +Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result) { + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); } -GLuint CachedShader::GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings) { - const auto [entry, is_cache_miss] = geometry_programs.try_emplace(base_bindings); - auto& programs = entry->second; +std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) { + const auto [entry, is_cache_miss] = programs.try_emplace(variant); + auto& program = entry->second; + if (is_cache_miss) { + program = TryLoadProgram(variant); + if (!program) { + program = SpecializeShader(code, entries, program_type, variant); + disk_cache.SaveUsage(GetUsage(variant)); + } - switch (primitive_mode) { - case GL_POINTS: - return LazyGeometryProgram(programs.points, base_bindings, primitive_mode); - case GL_LINES: - case GL_LINE_STRIP: - return LazyGeometryProgram(programs.lines, base_bindings, primitive_mode); - case GL_LINES_ADJACENCY: - case GL_LINE_STRIP_ADJACENCY: - return LazyGeometryProgram(programs.lines_adjacency, base_bindings, primitive_mode); - case GL_TRIANGLES: - case GL_TRIANGLE_STRIP: - case GL_TRIANGLE_FAN: - return LazyGeometryProgram(programs.triangles, base_bindings, primitive_mode); - case GL_TRIANGLES_ADJACENCY: - case GL_TRIANGLE_STRIP_ADJACENCY: - return LazyGeometryProgram(programs.triangles_adjacency, base_bindings, primitive_mode); - default: - UNREACHABLE_MSG("Unknown primitive mode."); - return LazyGeometryProgram(programs.points, base_bindings, primitive_mode); + LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); } -} -GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBindings base_bindings, - GLenum primitive_mode) { - if (target_program) { - return target_program->handle; - } - const auto [glsl_name, debug_name, vertices] = GetPrimitiveDescription(primitive_mode); - target_program = TryLoadProgram(primitive_mode, base_bindings); - if (!target_program) { - target_program = - SpecializeShader(code, entries, program_type, base_bindings, primitive_mode); - disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings)); + auto base_bindings = variant.base_bindings; + base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); + if (program_type != ProgramType::Compute) { + base_bindings.cbuf += STAGE_RESERVED_UBOS; } + base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); + base_bindings.sampler += static_cast<u32>(entries.samplers.size()); - LabelGLObject(GL_PROGRAM, target_program->handle, cpu_addr, debug_name); - - return target_program->handle; -}; + return {program->handle, base_bindings}; +} -CachedProgram CachedShader::TryLoadProgram(GLenum primitive_mode, - BaseBindings base_bindings) const { - const auto found = precompiled_programs.find(GetUsage(primitive_mode, base_bindings)); +CachedProgram CachedShader::TryLoadProgram(const ProgramVariant& variant) const { + const auto found = precompiled_programs.find(GetUsage(variant)); if (found == precompiled_programs.end()) { return {}; } return found->second; } -ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode, - BaseBindings base_bindings) const { - return {unique_identifier, base_bindings, primitive_mode}; +ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant) const { + ShaderDiskCacheUsage usage; + usage.unique_identifier = unique_identifier; + usage.variant = variant; + return usage; } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, @@ -411,8 +449,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } if (!shader) { shader = SpecializeShader(unspecialized.code, unspecialized.entries, - unspecialized.program_type, usage.bindings, - usage.primitive, true); + unspecialized.program_type, usage.variant, true); } std::scoped_lock lock(mutex); @@ -547,7 +584,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty_flags.shaders) { + if (!system.GPU().Maxwell3D().dirty.shaders) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -564,28 +601,55 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // No shader found - create a new one ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; - if (program == Maxwell::ShaderProgram::VertexA) { + const bool is_program_a{program == Maxwell::ShaderProgram::VertexA}; + if (is_program_a) { const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; program_code_b = GetShaderCode(memory_manager, program_addr_b, memory_manager.GetPointer(program_addr_b)); } - const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); - const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; + const auto unique_identifier = + GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b); + const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; + const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, + host_ptr, unique_identifier}; + const auto found = precompiled_shaders.find(unique_identifier); - if (found != precompiled_shaders.end()) { - // Create a shader from the cache - shader = std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache, - precompiled_programs, found->second, host_ptr); + if (found == precompiled_shaders.end()) { + shader = CachedShader::CreateStageFromMemory(params, program, std::move(program_code), + std::move(program_code_b)); } else { - // Create a shader from guest memory - shader = std::make_shared<CachedShader>( - device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, - std::move(program_code), std::move(program_code_b), host_ptr); + shader = CachedShader::CreateStageFromCache(params, program, found->second); } Register(shader); return last_shaders[static_cast<std::size_t>(program)] = shader; } +Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + auto& memory_manager{system.GPU().MemoryManager()}; + const auto host_ptr{memory_manager.GetPointer(code_addr)}; + auto kernel = TryGet(host_ptr); + if (kernel) { + return kernel; + } + + // No kernel found - create a new one + auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; + const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})}; + const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; + const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, + host_ptr, unique_identifier}; + + const auto found = precompiled_shaders.find(unique_identifier); + if (found == precompiled_shaders.end()) { + kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + } else { + kernel = CachedShader::CreateKernelFromCache(params, found->second); + } + + Register(kernel); + return kernel; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 09bd0761d..de195cc5d 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -6,6 +6,7 @@ #include <array> #include <atomic> +#include <bitset> #include <memory> #include <set> #include <tuple> @@ -41,17 +42,29 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>; using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>; +struct ShaderParameters { + ShaderDiskCacheOpenGL& disk_cache; + const PrecompiledPrograms& precompiled_programs; + const Device& device; + VAddr cpu_addr; + u8* host_ptr; + u64 unique_identifier; +}; + class CachedShader final : public RasterizerCacheObject { public: - explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, - Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, - const PrecompiledPrograms& precompiled_programs, - ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); + static Shader CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode&& program_code, ProgramCode&& program_code_b); + + static Shader CreateStageFromCache(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + GLShader::ProgramResult result); + + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code); - explicit CachedShader(VAddr cpu_addr, u64 unique_identifier, - Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, - const PrecompiledPrograms& precompiled_programs, - GLShader::ProgramResult result, u8* host_ptr); + static Shader CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result); VAddr GetCpuAddr() const override { return cpu_addr; @@ -67,49 +80,27 @@ public: } /// Gets the GL program handle for the shader - std::tuple<GLuint, BaseBindings> GetProgramHandle(GLenum primitive_mode, - BaseBindings base_bindings); + std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant); private: - // Geometry programs. These are needed because GLSL needs an input topology but it's not - // declared by the hardware. Workaround this issue by generating a different shader per input - // topology class. - struct GeometryPrograms { - CachedProgram points; - CachedProgram lines; - CachedProgram lines_adjacency; - CachedProgram triangles; - CachedProgram triangles_adjacency; - }; + explicit CachedShader(const ShaderParameters& params, ProgramType program_type, + GLShader::ProgramResult result); - GLuint GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings); + CachedProgram TryLoadProgram(const ProgramVariant& variant) const; - /// Generates a geometry shader or returns one that already exists. - GLuint LazyGeometryProgram(CachedProgram& target_program, BaseBindings base_bindings, - GLenum primitive_mode); + ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const; - CachedProgram TryLoadProgram(GLenum primitive_mode, BaseBindings base_bindings) const; - - ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const; - - u8* host_ptr{}; VAddr cpu_addr{}; u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; ShaderDiskCacheOpenGL& disk_cache; const PrecompiledPrograms& precompiled_programs; - std::size_t shader_length{}; GLShader::ShaderEntries entries; - std::string code; + std::size_t shader_length{}; - std::unordered_map<BaseBindings, CachedProgram> programs; - std::unordered_map<BaseBindings, GeometryPrograms> geometry_programs; - - std::unordered_map<u32, GLuint> cbuf_resource_cache; - std::unordered_map<u32, GLuint> gmem_resource_cache; - std::unordered_map<u32, GLint> uniform_cache; + std::unordered_map<ProgramVariant, CachedProgram> programs; }; class ShaderCacheOpenGL final : public RasterizerCache<Shader> { @@ -124,6 +115,9 @@ public: /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); + /// Gets a compute kernel in the passed address + Shader GetComputeKernel(GPUVAddr code_addr); + protected: // We do not have to flush this cache as things in it are never modified by us. void FlushObjectInner(const Shader& object) override {} diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 7dc2e0560..137b23740 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -14,6 +14,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -36,19 +37,18 @@ using namespace std::string_literals; using namespace VideoCommon::Shader; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; -enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; +enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; struct TextureAoffi {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureAoffi, TextureArgument>; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); + static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); -class ShaderWriter { +class ShaderWriter final { public: void AddExpression(std::string_view text) { DEBUG_ASSERT(scope >= 0); @@ -93,9 +93,157 @@ private: u32 temporary_index = 1; }; +class Expression final { +public: + Expression(std::string code, Type type) : code{std::move(code)}, type{type} { + ASSERT(type != Type::Void); + } + Expression() : type{Type::Void} {} + + Type GetType() const { + return type; + } + + std::string GetCode() const { + return code; + } + + void CheckVoid() const { + ASSERT(type == Type::Void); + } + + std::string As(Type type) const { + switch (type) { + case Type::Bool: + return AsBool(); + case Type::Bool2: + return AsBool2(); + case Type::Float: + return AsFloat(); + case Type::Int: + return AsInt(); + case Type::Uint: + return AsUint(); + case Type::HalfFloat: + return AsHalfFloat(); + default: + UNREACHABLE_MSG("Invalid type"); + return code; + } + } + + std::string AsBool() const { + switch (type) { + case Type::Bool: + return code; + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + + std::string AsBool2() const { + switch (type) { + case Type::Bool2: + return code; + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + + std::string AsFloat() const { + switch (type) { + case Type::Float: + return code; + case Type::Uint: + return fmt::format("utof({})", code); + case Type::Int: + return fmt::format("itof({})", code); + case Type::HalfFloat: + return fmt::format("utof(packHalf2x16({}))", code); + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + + std::string AsInt() const { + switch (type) { + case Type::Float: + return fmt::format("ftoi({})", code); + case Type::Uint: + return fmt::format("int({})", code); + case Type::Int: + return code; + case Type::HalfFloat: + return fmt::format("int(packHalf2x16({}))", code); + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + + std::string AsUint() const { + switch (type) { + case Type::Float: + return fmt::format("ftou({})", code); + case Type::Uint: + return code; + case Type::Int: + return fmt::format("uint({})", code); + case Type::HalfFloat: + return fmt::format("packHalf2x16({})", code); + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + + std::string AsHalfFloat() const { + switch (type) { + case Type::Float: + return fmt::format("unpackHalf2x16(ftou({}))", code); + case Type::Uint: + return fmt::format("unpackHalf2x16({})", code); + case Type::Int: + return fmt::format("unpackHalf2x16(int({}))", code); + case Type::HalfFloat: + return code; + default: + UNREACHABLE_MSG("Incompatible types"); + return code; + } + } + +private: + std::string code; + Type type{}; +}; + +constexpr const char* GetTypeString(Type type) { + switch (type) { + case Type::Bool: + return "bool"; + case Type::Bool2: + return "bvec2"; + case Type::Float: + return "float"; + case Type::Int: + return "int"; + case Type::Uint: + return "uint"; + case Type::HalfFloat: + return "vec2"; + default: + UNREACHABLE_MSG("Invalid type"); + return "<invalid type>"; + } +} + /// Generates code to use for a swizzle operation. constexpr const char* GetSwizzle(u32 element) { - constexpr std::array<const char*, 4> swizzle = {".x", ".y", ".z", ".w"}; + constexpr std::array swizzle = {".x", ".y", ".z", ".w"}; return swizzle.at(element); } @@ -134,8 +282,8 @@ constexpr bool IsGenericAttribute(Attribute::Index index) { return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; } -constexpr Attribute::Index ToGenericAttribute(u32 value) { - return static_cast<Attribute::Index>(value + static_cast<u32>(Attribute::Index::Attribute_0)); +constexpr Attribute::Index ToGenericAttribute(u64 value) { + return static_cast<Attribute::Index>(value + static_cast<u64>(Attribute::Index::Attribute_0)); } u32 GetGenericAttributeIndex(Attribute::Index index) { @@ -161,9 +309,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +constexpr bool IsVertexShader(ProgramType stage) { + return stage == ProgramType::VertexA || stage == ProgramType::VertexB; +} + class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage, std::string suffix) : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} @@ -180,20 +332,23 @@ public: DeclareGlobalMemory(); DeclareSamplers(); DeclarePhysicalAttributeReader(); + DeclareImages(); code.AddLine("void execute_{}() {{", suffix); ++code.scope; // VM's program counter const auto first_address = ir.GetBasicBlocks().begin()->first; - code.AddLine("uint jmp_to = {}u;", first_address); + code.AddLine("uint jmp_to = {}U;", first_address); // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems // unlikely that shaders will use 20 nested SSYs and PBKs. - constexpr u32 FLOW_STACK_SIZE = 20; - for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { - code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); - code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + if (!ir.IsFlowStackDisabled()) { + constexpr u32 FLOW_STACK_SIZE = 20; + for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { + code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); + code.AddLine("uint {} = 0U;", FlowStackTopName(stack)); + } } code.AddLine("while (true) {{"); @@ -203,7 +358,7 @@ public: for (const auto& pair : ir.GetBasicBlocks()) { const auto [address, bb] = pair; - code.AddLine("case 0x{:x}u: {{", address); + code.AddLine("case 0x{:X}U: {{", address); ++code.scope; VisitBlock(bb); @@ -234,30 +389,30 @@ public: for (const auto& sampler : ir.GetSamplers()) { entries.samplers.emplace_back(sampler); } - for (const auto& gmem_pair : ir.GetGlobalMemory()) { - const auto& [base, usage] = gmem_pair; + for (const auto& [offset, image] : ir.GetImages()) { + entries.images.emplace_back(image); + } + for (const auto& [base, usage] : ir.GetGlobalMemory()) { entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_read, usage.is_written); } entries.clip_distances = ir.GetClipDistances(); + entries.shader_viewport_layer_array = + IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex()); entries.shader_length = ir.GetLength(); return entries; } private: - using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - void DeclareVertex() { - if (stage != ShaderStage::Vertex) + if (!IsVertexShader(stage)) return; DeclareVertexRedeclarations(); } void DeclareGeometry() { - if (stage != ShaderStage::Geometry) { + if (stage != ProgramType::Geometry) { return; } @@ -276,21 +431,34 @@ private: } void DeclareVertexRedeclarations() { - bool clip_distances_declared = false; - code.AddLine("out gl_PerVertex {{"); ++code.scope; code.AddLine("vec4 gl_Position;"); - for (const auto o : ir.GetOutputAttributes()) { - if (o == Attribute::Index::PointSize) - code.AddLine("float gl_PointSize;"); - if (!clip_distances_declared && (o == Attribute::Index::ClipDistances0123 || - o == Attribute::Index::ClipDistances4567)) { + for (const auto attribute : ir.GetOutputAttributes()) { + if (attribute == Attribute::Index::ClipDistances0123 || + attribute == Attribute::Index::ClipDistances4567) { code.AddLine("float gl_ClipDistance[];"); - clip_distances_declared = true; + break; + } + } + if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { + if (ir.UsesLayer()) { + code.AddLine("int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("int gl_ViewportIndex;"); } + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && + !device.HasVertexViewportLayer()) { + LOG_ERROR( + Render_OpenGL, + "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); + } + + if (ir.UsesPointSize()) { + code.AddLine("float gl_PointSize;"); } --code.scope; @@ -301,7 +469,7 @@ private: void DeclareRegisters() { const auto& registers = ir.GetRegisters(); for (const u32 gpr : registers) { - code.AddLine("float {} = 0;", GetRegister(gpr)); + code.AddLine("float {} = 0.0f;", GetRegister(gpr)); } if (!registers.empty()) { code.AddNewLine(); @@ -319,11 +487,16 @@ private: } void DeclareLocalMemory() { - if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) { - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; - code.AddLine("float {}[{}];", GetLocalMemory(), element_count); - code.AddNewLine(); + // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at + // specialization time. + const u64 local_memory_size = + stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize(); + if (local_memory_size == 0) { + return; } + const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + code.AddLine("uint {}[{}];", GetLocalMemory(), element_count); + code.AddNewLine(); } void DeclareInternalFlags() { @@ -345,8 +518,6 @@ private: return "noperspective "; default: case AttributeUse::Unused: - UNREACHABLE_MSG("Unused attribute being fetched"); - return {}; UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<u32>(attribute)); return {}; } @@ -377,12 +548,12 @@ private: const u32 location{GetGenericAttributeIndex(index)}; std::string name{GetInputAttribute(index)}; - if (stage == ShaderStage::Geometry) { + if (stage == ProgramType::Geometry) { name = "gs_" + name + "[]"; } std::string suffix; - if (stage == ShaderStage::Fragment) { + if (stage == ProgramType::Fragment) { const auto input_mode{header.ps.GetAttributeUse(location)}; if (skip_unused && input_mode == AttributeUse::Unused) { return; @@ -394,7 +565,7 @@ private: } void DeclareOutputAttributes() { - if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) { + if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) { for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) { DeclareOutputAttribute(ToGenericAttribute(i)); } @@ -423,7 +594,7 @@ private: const auto [index, size] = entry; code.AddLine("layout (std140, binding = CBUF_BINDING_{}) uniform {} {{", index, GetConstBufferBlock(index)); - code.AddLine(" vec4 {}[MAX_CONSTBUFFER_ELEMENTS];", GetConstBuffer(index)); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); code.AddLine("}};"); code.AddNewLine(); } @@ -444,7 +615,7 @@ private: code.AddLine("layout (std430, binding = GMEM_BINDING_{}_{}) {} buffer {} {{", base.cbuf_index, base.cbuf_offset, qualifier, GetGlobalMemoryBlock(base)); - code.AddLine(" float {}[];", GetGlobalMemory(base)); + code.AddLine(" uint {}[];", GetGlobalMemory(base)); code.AddLine("}};"); code.AddNewLine(); } @@ -453,9 +624,13 @@ private: void DeclareSamplers() { const auto& samplers = ir.GetSamplers(); for (const auto& sampler : samplers) { - std::string sampler_type = [&sampler] { + const std::string name{GetSampler(sampler)}; + const std::string description{"layout (binding = SAMPLER_BINDING_" + + std::to_string(sampler.GetIndex()) + ") uniform"}; + std::string sampler_type = [&]() { switch (sampler.GetType()) { case Tegra::Shader::TextureType::Texture1D: + // Special cased, read below. return "sampler1D"; case Tegra::Shader::TextureType::Texture2D: return "sampler2D"; @@ -475,8 +650,19 @@ private: sampler_type += "Shadow"; } - code.AddLine("layout (binding = SAMPLER_BINDING_{}) uniform {} {};", sampler.GetIndex(), - sampler_type, GetSampler(sampler)); + if (sampler.GetType() == Tegra::Shader::TextureType::Texture1D) { + // 1D textures can be aliased to texture buffers, hide the declarations behind a + // preprocessor flag and use one or the other from the GPU state. This has to be + // done because shaders don't have enough information to determine the texture type. + EmitIfdefIsBuffer(sampler); + code.AddLine("{} samplerBuffer {};", description, name); + code.AddLine("#else"); + code.AddLine("{} {} {};", description, sampler_type, name); + code.AddLine("#endif"); + } else { + // The other texture types (2D, 3D and cubes) don't have this issue. + code.AddLine("{} {} {};", description, sampler_type, name); + } } if (!samplers.empty()) { code.AddNewLine(); @@ -487,7 +673,7 @@ private: if (!ir.HasPhysicalAttributes()) { return; } - code.AddLine("float readPhysicalAttribute(uint physical_address) {{"); + code.AddLine("float ReadPhysicalAttribute(uint physical_address) {{"); ++code.scope; code.AddLine("switch (physical_address) {{"); @@ -496,15 +682,16 @@ private: for (u32 index = 0; index < num_attributes; ++index) { const auto attribute{ToGenericAttribute(index)}; for (u32 element = 0; element < 4; ++element) { - constexpr u32 generic_base{0x80}; - constexpr u32 generic_stride{16}; - constexpr u32 element_stride{4}; + constexpr u32 generic_base = 0x80; + constexpr u32 generic_stride = 16; + constexpr u32 element_stride = 4; const u32 address{generic_base + index * generic_stride + element * element_stride}; - const bool declared{stage != ShaderStage::Fragment || - header.ps.GetAttributeUse(index) != AttributeUse::Unused}; - const std::string value{declared ? ReadAttribute(attribute, element) : "0"}; - code.AddLine("case 0x{:x}: return {};", address, value); + const bool declared = stage != ProgramType::Fragment || + header.ps.GetAttributeUse(index) != AttributeUse::Unused; + const std::string value = + declared ? ReadAttribute(attribute, element).AsFloat() : "0.0f"; + code.AddLine("case 0x{:X}U: return {};", address, value); } } @@ -516,15 +703,68 @@ private: code.AddNewLine(); } + void DeclareImages() { + const auto& images{ir.GetImages()}; + for (const auto& [offset, image] : images) { + const char* image_type = [&] { + switch (image.GetType()) { + case Tegra::Shader::ImageType::Texture1D: + return "image1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "imageBuffer"; + case Tegra::Shader::ImageType::Texture1DArray: + return "image1DArray"; + case Tegra::Shader::ImageType::Texture2D: + return "image2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "image2DArray"; + case Tegra::Shader::ImageType::Texture3D: + return "image3D"; + default: + UNREACHABLE(); + return "image1D"; + } + }(); + + const auto [type_prefix, format] = [&]() -> std::pair<const char*, const char*> { + if (!image.IsSizeKnown()) { + return {"", ""}; + } + switch (image.GetSize()) { + case Tegra::Shader::ImageAtomicSize::U32: + return {"u", "r32ui, "}; + case Tegra::Shader::ImageAtomicSize::S32: + return {"i", "r32i, "}; + default: + UNIMPLEMENTED_MSG("Unimplemented atomic size={}", + static_cast<u32>(image.GetSize())); + return {"", ""}; + } + }(); + + std::string qualifier = "coherent volatile"; + if (image.IsRead() && !image.IsWritten()) { + qualifier += " readonly"; + } else if (image.IsWritten() && !image.IsRead()) { + qualifier += " writeonly"; + } + + code.AddLine("layout (binding = IMAGE_BINDING_{}) {} uniform " + "{} {};", + image.GetIndex(), qualifier, image_type, GetImage(image)); + } + if (!images.empty()) { + code.AddNewLine(); + } + } + void VisitBlock(const NodeBlock& bb) { for (const auto& node : bb) { - if (const std::string expr = Visit(node); !expr.empty()) { - code.AddLine(expr); - } + Visit(node).CheckVoid(); } } - std::string Visit(const Node& node) { + Expression Visit(const Node& node) { if (const auto operation = std::get_if<OperationNode>(&*node)) { const auto operation_index = static_cast<std::size_t>(operation->GetCode()); if (operation_index >= operation_decompilers.size()) { @@ -542,18 +782,18 @@ private: if (const auto gpr = std::get_if<GprNode>(&*node)) { const u32 index = gpr->GetIndex(); if (index == Register::ZeroIndex) { - return "0"; + return {"0U", Type::Uint}; } - return GetRegister(index); + return {GetRegister(index), Type::Float}; } if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { const u32 value = immediate->GetValue(); if (value < 10) { // For eyecandy avoid using hex numbers on single digits - return fmt::format("utof({}u)", immediate->GetValue()); + return {fmt::format("{}U", immediate->GetValue()), Type::Uint}; } - return fmt::format("utof(0x{:x}u)", immediate->GetValue()); + return {fmt::format("0x{:X}U", immediate->GetValue()), Type::Uint}; } if (const auto predicate = std::get_if<PredicateNode>(&*node)) { @@ -568,17 +808,18 @@ private: } }(); if (predicate->IsNegated()) { - return fmt::format("!({})", value); + return {fmt::format("!({})", value), Type::Bool}; } - return value; + return {value, Type::Bool}; } if (const auto abuf = std::get_if<AbufNode>(&*node)) { - UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry, + UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry, "Physical attributes in geometry shaders are not implemented"); if (abuf->IsPhysicalBuffer()) { - return fmt::format("readPhysicalAttribute(ftou({}))", - Visit(abuf->GetPhysicalAddress())); + return {fmt::format("ReadPhysicalAttribute({})", + Visit(abuf->GetPhysicalAddress()).AsUint()), + Type::Float}; } return ReadAttribute(abuf->GetIndex(), abuf->GetElement(), abuf->GetBuffer()); } @@ -589,56 +830,64 @@ private: // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4); + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; } if (std::holds_alternative<OperationNode>(*offset)) { // Indirect access const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = ftou({}) >> 2;", final_offset, Visit(offset)); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); if (!device.HasComponentIndexingBug()) { - return fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset); + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } // AMD's proprietary GLSL compiler emits ill code for variable component access. // To bypass this driver bug generate 4 ifs, one per each component. const std::string pack = code.GenerateTemporary(); - code.AddLine("vec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), final_offset); const std::string result = code.GenerateTemporary(); - code.AddLine("float {};", result); + code.AddLine("uint {};", result); for (u32 swizzle = 0; swizzle < 4; ++swizzle) { code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, GetSwizzle(swizzle)); } - return result; + return {result, Type::Uint}; } UNREACHABLE_MSG("Unmanaged offset node type"); } if (const auto gmem = std::get_if<GmemNode>(&*node)) { - const std::string real = Visit(gmem->GetRealAddress()); - const std::string base = Visit(gmem->GetBaseAddress()); - const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base); - return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset); + const std::string real = Visit(gmem->GetRealAddress()).AsUint(); + const std::string base = Visit(gmem->GetBaseAddress()).AsUint(); + const std::string final_offset = fmt::format("({} - {}) >> 2", real, base); + return {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset), + Type::Uint}; } if (const auto lmem = std::get_if<LmemNode>(&*node)) { - return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } + return { + fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()), + Type::Uint}; } if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { - return GetInternalFlag(internal_flag->GetFlag()); + return {GetInternalFlag(internal_flag->GetFlag()), Type::Bool}; } if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { // It's invalid to call conditional on nested nodes, use an operation instead - code.AddLine("if ({}) {{", Visit(conditional->GetCondition())); + code.AddLine("if ({}) {{", Visit(conditional->GetCondition()).AsBool()); ++code.scope; VisitBlock(conditional->GetCode()); @@ -649,20 +898,21 @@ private: } if (const auto comment = std::get_if<CommentNode>(&*node)) { - return "// " + comment->GetText(); + code.AddLine("// " + comment->GetText()); + return {}; } UNREACHABLE(); return {}; } - std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) { + Expression ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) { const auto GeometryPass = [&](std::string_view name) { - if (stage == ShaderStage::Geometry && buffer) { + if (stage == ProgramType::Geometry && buffer) { // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. - return fmt::format("gs_{}[ftou({}) % MAX_VERTEX_INPUT]", name, Visit(buffer)); + return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint()); } return std::string(name); }; @@ -670,72 +920,79 @@ private: switch (attribute) { case Attribute::Index::Position: switch (stage) { - case ShaderStage::Geometry: - return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer), - GetSwizzle(element)); - case ShaderStage::Fragment: - return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)); + case ProgramType::Geometry: + return {fmt::format("gl_in[{}].gl_Position{}", Visit(buffer).AsUint(), + GetSwizzle(element)), + Type::Float}; + case ProgramType::Fragment: + return {element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)), + Type::Float}; default: UNREACHABLE(); } case Attribute::Index::PointCoord: switch (element) { case 0: - return "gl_PointCoord.x"; + return {"gl_PointCoord.x", Type::Float}; case 1: - return "gl_PointCoord.y"; + return {"gl_PointCoord.y", Type::Float}; case 2: case 3: - return "0"; + return {"0.0f", Type::Float}; } UNREACHABLE(); - return "0"; + return {"0", Type::Int}; case Attribute::Index::TessCoordInstanceIDVertexID: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. - ASSERT(stage == ShaderStage::Vertex); + ASSERT(IsVertexShader(stage)); switch (element) { case 2: // Config pack's first value is instance_id. - return "uintBitsToFloat(config_pack[0])"; + return {"config_pack[0]", Type::Uint}; case 3: - return "uintBitsToFloat(gl_VertexID)"; + return {"gl_VertexID", Type::Int}; } UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); - return "0"; + return {"0", Type::Int}; case Attribute::Index::FrontFacing: // TODO(Subv): Find out what the values are for the other elements. - ASSERT(stage == ShaderStage::Fragment); + ASSERT(stage == ProgramType::Fragment); switch (element) { case 3: - return "itof(gl_FrontFacing ? -1 : 0)"; + return {"(gl_FrontFacing ? -1 : 0)", Type::Int}; } UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element); - return "0"; + return {"0", Type::Int}; default: if (IsGenericAttribute(attribute)) { - return GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element); + return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element), + Type::Float}; } break; } UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); - return "0"; + return {"0", Type::Int}; } - std::string ApplyPrecise(Operation operation, const std::string& value) { + Expression ApplyPrecise(Operation operation, std::string value, Type type) { if (!IsPrecise(operation)) { - return value; + return {std::move(value), type}; } - // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders - const std::string precise = stage != ShaderStage::Fragment ? "precise " : ""; + // Old Nvidia drivers have a bug with precise and texture sampling. These are more likely to + // be found in fragment shaders, so we disable precise there. There are vertex shaders that + // also fail to build but nobody seems to care about those. + // Note: Only bugged drivers will skip precise. + const bool disable_precise = device.HasPreciseBug() && stage == ProgramType::Fragment; - const std::string temporary = code.GenerateTemporary(); - code.AddLine("{}float {} = {};", precise, temporary, value); - return temporary; + std::string temporary = code.GenerateTemporary(); + code.AddLine("{}{} {} = {};", disable_precise ? "" : "precise ", GetTypeString(type), + temporary, value); + return {std::move(temporary), type}; } - std::string VisitOperand(Operation operation, std::size_t operand_index) { + Expression VisitOperand(Operation operation, std::size_t operand_index) { const auto& operand = operation[operand_index]; const bool parent_precise = IsPrecise(operation); const bool child_precise = IsPrecise(operand); @@ -744,102 +1001,98 @@ private: return Visit(operand); } - const std::string temporary = code.GenerateTemporary(); - code.AddLine("float {} = {};", temporary, Visit(operand)); - return temporary; + Expression value = Visit(operand); + std::string temporary = code.GenerateTemporary(); + code.AddLine("{} {} = {};", GetTypeString(value.GetType()), temporary, value.GetCode()); + return {std::move(temporary), value.GetType()}; } - std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { - return CastOperand(VisitOperand(operation, operand_index), type); - } - - std::string CastOperand(const std::string& value, Type type) const { - switch (type) { - case Type::Bool: - case Type::Bool2: - case Type::Float: - return value; - case Type::Int: - return fmt::format("ftoi({})", value); - case Type::Uint: - return fmt::format("ftou({})", value); - case Type::HalfFloat: - return fmt::format("toHalf2({})", value); - } - UNREACHABLE(); - return value; - } - - std::string BitwiseCastResult(const std::string& value, Type type, - bool needs_parenthesis = false) { - switch (type) { - case Type::Bool: - case Type::Bool2: - case Type::Float: - if (needs_parenthesis) { - return fmt::format("({})", value); + Expression GetOutputAttribute(const AbufNode* abuf) { + switch (const auto attribute = abuf->GetIndex()) { + case Attribute::Index::Position: + return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}; + case Attribute::Index::LayerViewportPointSize: + switch (abuf->GetElement()) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + return {}; + } + return {"gl_Layer", Type::Int}; + case 2: + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + return {}; + } + return {"gl_ViewportIndex", Type::Int}; + case 3: + UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); + return {"gl_PointSize", Type::Float}; } - return value; - case Type::Int: - return fmt::format("itof({})", value); - case Type::Uint: - return fmt::format("utof({})", value); - case Type::HalfFloat: - return fmt::format("fromHalf2({})", value); + return {}; + case Attribute::Index::ClipDistances0123: + return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}; + case Attribute::Index::ClipDistances4567: + return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}; + default: + if (IsGenericAttribute(attribute)) { + return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), + Type::Float}; + } + UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); + return {}; } - UNREACHABLE(); - return value; } - std::string GenerateUnary(Operation operation, const std::string& func, Type result_type, - Type type_a, bool needs_parenthesis = true) { - const std::string op_str = fmt::format("{}({})", func, VisitOperand(operation, 0, type_a)); - - return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type, needs_parenthesis)); + Expression GenerateUnary(Operation operation, std::string_view func, Type result_type, + Type type_a) { + std::string op_str = fmt::format("{}({})", func, VisitOperand(operation, 0).As(type_a)); + return ApplyPrecise(operation, std::move(op_str), result_type); } - std::string GenerateBinaryInfix(Operation operation, const std::string& func, Type result_type, - Type type_a, Type type_b) { - const std::string op_a = VisitOperand(operation, 0, type_a); - const std::string op_b = VisitOperand(operation, 1, type_b); - const std::string op_str = fmt::format("({} {} {})", op_a, func, op_b); + Expression GenerateBinaryInfix(Operation operation, std::string_view func, Type result_type, + Type type_a, Type type_b) { + const std::string op_a = VisitOperand(operation, 0).As(type_a); + const std::string op_b = VisitOperand(operation, 1).As(type_b); + std::string op_str = fmt::format("({} {} {})", op_a, func, op_b); - return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type)); + return ApplyPrecise(operation, std::move(op_str), result_type); } - std::string GenerateBinaryCall(Operation operation, const std::string& func, Type result_type, - Type type_a, Type type_b) { - const std::string op_a = VisitOperand(operation, 0, type_a); - const std::string op_b = VisitOperand(operation, 1, type_b); - const std::string op_str = fmt::format("{}({}, {})", func, op_a, op_b); + Expression GenerateBinaryCall(Operation operation, std::string_view func, Type result_type, + Type type_a, Type type_b) { + const std::string op_a = VisitOperand(operation, 0).As(type_a); + const std::string op_b = VisitOperand(operation, 1).As(type_b); + std::string op_str = fmt::format("{}({}, {})", func, op_a, op_b); - return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type)); + return ApplyPrecise(operation, std::move(op_str), result_type); } - std::string GenerateTernary(Operation operation, const std::string& func, Type result_type, - Type type_a, Type type_b, Type type_c) { - const std::string op_a = VisitOperand(operation, 0, type_a); - const std::string op_b = VisitOperand(operation, 1, type_b); - const std::string op_c = VisitOperand(operation, 2, type_c); - const std::string op_str = fmt::format("{}({}, {}, {})", func, op_a, op_b, op_c); + Expression GenerateTernary(Operation operation, std::string_view func, Type result_type, + Type type_a, Type type_b, Type type_c) { + const std::string op_a = VisitOperand(operation, 0).As(type_a); + const std::string op_b = VisitOperand(operation, 1).As(type_b); + const std::string op_c = VisitOperand(operation, 2).As(type_c); + std::string op_str = fmt::format("{}({}, {}, {})", func, op_a, op_b, op_c); - return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type)); + return ApplyPrecise(operation, std::move(op_str), result_type); } - std::string GenerateQuaternary(Operation operation, const std::string& func, Type result_type, - Type type_a, Type type_b, Type type_c, Type type_d) { - const std::string op_a = VisitOperand(operation, 0, type_a); - const std::string op_b = VisitOperand(operation, 1, type_b); - const std::string op_c = VisitOperand(operation, 2, type_c); - const std::string op_d = VisitOperand(operation, 3, type_d); - const std::string op_str = fmt::format("{}({}, {}, {}, {})", func, op_a, op_b, op_c, op_d); + Expression GenerateQuaternary(Operation operation, const std::string& func, Type result_type, + Type type_a, Type type_b, Type type_c, Type type_d) { + const std::string op_a = VisitOperand(operation, 0).As(type_a); + const std::string op_b = VisitOperand(operation, 1).As(type_b); + const std::string op_c = VisitOperand(operation, 2).As(type_c); + const std::string op_d = VisitOperand(operation, 3).As(type_d); + std::string op_str = fmt::format("{}({}, {}, {}, {})", func, op_a, op_b, op_c, op_d); - return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type)); + return ApplyPrecise(operation, std::move(op_str), result_type); } std::string GenerateTexture(Operation operation, const std::string& function_suffix, const std::vector<TextureIR>& extras) { - constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"}; + constexpr std::array coord_constructors = {"float", "vec2", "vec3", "vec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); @@ -856,17 +1109,17 @@ private: expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += Visit(operation[i]); + expr += Visit(operation[i]).AsFloat(); const std::size_t next = i + 1; if (next < count) expr += ", "; } if (has_array) { - expr += ", float(ftoi(" + Visit(meta->array) + "))"; + expr += ", float(" + Visit(meta->array).AsInt() + ')'; } if (has_shadow) { - expr += ", " + Visit(meta->depth_compare); + expr += ", " + Visit(meta->depth_compare).AsFloat(); } expr += ')'; @@ -897,11 +1150,11 @@ private: // required to be constant) expr += std::to_string(static_cast<s32>(immediate->GetValue())); } else { - expr += fmt::format("ftoi({})", Visit(operand)); + expr += Visit(operand).AsInt(); } break; case Type::Float: - expr += Visit(operand); + expr += Visit(operand).AsFloat(); break; default: { const auto type_int = static_cast<u32>(type); @@ -917,7 +1170,7 @@ private: if (aoffi.empty()) { return {}; } - constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"}; + constexpr std::array coord_constructors = {"int", "ivec2", "ivec3"}; std::string expr = ", "; expr += coord_constructors.at(aoffi.size() - 1); expr += '('; @@ -930,7 +1183,7 @@ private: expr += std::to_string(static_cast<s32>(immediate->GetValue())); } else if (device.HasVariableAoffi()) { // Avoid using variable AOFFI on unsupported devices. - expr += fmt::format("ftoi({})", Visit(operand)); + expr += Visit(operand).AsInt(); } else { // Insert 0 on devices not supporting variable AOFFI. expr += '0'; @@ -944,318 +1197,382 @@ private: return expr; } - std::string Assign(Operation operation) { + std::string BuildIntegerCoordinates(Operation operation) { + constexpr std::array constructors{"int(", "ivec2(", "ivec3(", "ivec4("}; + const std::size_t coords_count{operation.GetOperandsCount()}; + std::string expr = constructors.at(coords_count - 1); + for (std::size_t i = 0; i < coords_count; ++i) { + expr += VisitOperand(operation, i).AsInt(); + if (i + 1 < coords_count) { + expr += ", "; + } + } + expr += ')'; + return expr; + } + + std::string BuildImageValues(Operation operation) { + const auto meta{std::get<MetaImage>(operation.GetMeta())}; + const auto [constructors, type] = [&]() -> std::pair<std::array<const char*, 4>, Type> { + constexpr std::array float_constructors{"float", "vec2", "vec3", "vec4"}; + if (!meta.image.IsSizeKnown()) { + return {float_constructors, Type::Float}; + } + switch (meta.image.GetSize()) { + case Tegra::Shader::ImageAtomicSize::U32: + return {{"uint", "uvec2", "uvec3", "uvec4"}, Type::Uint}; + case Tegra::Shader::ImageAtomicSize::S32: + return {{"int", "ivec2", "ivec3", "ivec4"}, Type::Uint}; + default: + UNIMPLEMENTED_MSG("Unimplemented image size={}", + static_cast<u32>(meta.image.GetSize())); + return {float_constructors, Type::Float}; + } + }(); + + const std::size_t values_count{meta.values.size()}; + std::string expr = fmt::format("{}(", constructors.at(values_count - 1)); + for (std::size_t i = 0; i < values_count; ++i) { + expr += Visit(meta.values.at(i)).As(type); + if (i + 1 < values_count) { + expr += ", "; + } + } + expr += ')'; + return expr; + } + + Expression AtomicImage(Operation operation, const char* opname) { + constexpr std::array constructors{"int(", "ivec2(", "ivec3(", "ivec4("}; + const auto meta{std::get<MetaImage>(operation.GetMeta())}; + ASSERT(meta.values.size() == 1); + ASSERT(meta.image.IsSizeKnown()); + + const auto type = [&]() { + switch (const auto size = meta.image.GetSize()) { + case Tegra::Shader::ImageAtomicSize::U32: + return Type::Uint; + case Tegra::Shader::ImageAtomicSize::S32: + return Type::Int; + default: + UNIMPLEMENTED_MSG("Unimplemented image size={}", static_cast<u32>(size)); + return Type::Uint; + } + }(); + + return {fmt::format("{}({}, {}, {})", opname, GetImage(meta.image), + BuildIntegerCoordinates(operation), Visit(meta.values[0]).As(type)), + type}; + } + + Expression Assign(Operation operation) { const Node& dest = operation[0]; const Node& src = operation[1]; - std::string target; + Expression target; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { // Writing to Register::ZeroIndex is a no op return {}; } - target = GetRegister(gpr->GetIndex()); + target = {GetRegister(gpr->GetIndex()), Type::Float}; } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer()); - - target = [&]() -> std::string { - switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) { - case Attribute::Index::Position: - return "gl_Position"s + GetSwizzle(abuf->GetElement()); - case Attribute::Index::PointSize: - return "gl_PointSize"; - case Attribute::Index::ClipDistances0123: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement()); - case Attribute::Index::ClipDistances4567: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4); - default: - if (IsGenericAttribute(attribute)) { - return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()); - } - UNIMPLEMENTED_MSG("Unhandled output attribute: {}", - static_cast<u32>(attribute)); - return "0"; - } - }(); + target = GetOutputAttribute(abuf); } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { - target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } + target = { + fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()), + Type::Uint}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { - const std::string real = Visit(gmem->GetRealAddress()); - const std::string base = Visit(gmem->GetBaseAddress()); - const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base); - target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset); + const std::string real = Visit(gmem->GetRealAddress()).AsUint(); + const std::string base = Visit(gmem->GetBaseAddress()).AsUint(); + const std::string final_offset = fmt::format("({} - {}) >> 2", real, base); + target = {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset), + Type::Uint}; } else { UNREACHABLE_MSG("Assign called without a proper target"); } - code.AddLine("{} = {};", target, Visit(src)); + code.AddLine("{} = {};", target.GetCode(), Visit(src).As(target.GetType())); return {}; } template <Type type> - std::string Add(Operation operation) { + Expression Add(Operation operation) { return GenerateBinaryInfix(operation, "+", type, type, type); } template <Type type> - std::string Mul(Operation operation) { + Expression Mul(Operation operation) { return GenerateBinaryInfix(operation, "*", type, type, type); } template <Type type> - std::string Div(Operation operation) { + Expression Div(Operation operation) { return GenerateBinaryInfix(operation, "/", type, type, type); } template <Type type> - std::string Fma(Operation operation) { + Expression Fma(Operation operation) { return GenerateTernary(operation, "fma", type, type, type, type); } template <Type type> - std::string Negate(Operation operation) { - return GenerateUnary(operation, "-", type, type, true); + Expression Negate(Operation operation) { + return GenerateUnary(operation, "-", type, type); } template <Type type> - std::string Absolute(Operation operation) { - return GenerateUnary(operation, "abs", type, type, false); + Expression Absolute(Operation operation) { + return GenerateUnary(operation, "abs", type, type); } - std::string FClamp(Operation operation) { + Expression FClamp(Operation operation) { return GenerateTernary(operation, "clamp", Type::Float, Type::Float, Type::Float, Type::Float); } + Expression FCastHalf0(Operation operation) { + return {fmt::format("({})[0]", VisitOperand(operation, 0).AsHalfFloat()), Type::Float}; + } + + Expression FCastHalf1(Operation operation) { + return {fmt::format("({})[1]", VisitOperand(operation, 0).AsHalfFloat()), Type::Float}; + } + template <Type type> - std::string Min(Operation operation) { + Expression Min(Operation operation) { return GenerateBinaryCall(operation, "min", type, type, type); } template <Type type> - std::string Max(Operation operation) { + Expression Max(Operation operation) { return GenerateBinaryCall(operation, "max", type, type, type); } - std::string Select(Operation operation) { - const std::string condition = Visit(operation[0]); - const std::string true_case = Visit(operation[1]); - const std::string false_case = Visit(operation[2]); - const std::string op_str = fmt::format("({} ? {} : {})", condition, true_case, false_case); + Expression Select(Operation operation) { + const std::string condition = Visit(operation[0]).AsBool(); + const std::string true_case = Visit(operation[1]).AsUint(); + const std::string false_case = Visit(operation[2]).AsUint(); + std::string op_str = fmt::format("({} ? {} : {})", condition, true_case, false_case); - return ApplyPrecise(operation, op_str); + return ApplyPrecise(operation, std::move(op_str), Type::Uint); } - std::string FCos(Operation operation) { - return GenerateUnary(operation, "cos", Type::Float, Type::Float, false); + Expression FCos(Operation operation) { + return GenerateUnary(operation, "cos", Type::Float, Type::Float); } - std::string FSin(Operation operation) { - return GenerateUnary(operation, "sin", Type::Float, Type::Float, false); + Expression FSin(Operation operation) { + return GenerateUnary(operation, "sin", Type::Float, Type::Float); } - std::string FExp2(Operation operation) { - return GenerateUnary(operation, "exp2", Type::Float, Type::Float, false); + Expression FExp2(Operation operation) { + return GenerateUnary(operation, "exp2", Type::Float, Type::Float); } - std::string FLog2(Operation operation) { - return GenerateUnary(operation, "log2", Type::Float, Type::Float, false); + Expression FLog2(Operation operation) { + return GenerateUnary(operation, "log2", Type::Float, Type::Float); } - std::string FInverseSqrt(Operation operation) { - return GenerateUnary(operation, "inversesqrt", Type::Float, Type::Float, false); + Expression FInverseSqrt(Operation operation) { + return GenerateUnary(operation, "inversesqrt", Type::Float, Type::Float); } - std::string FSqrt(Operation operation) { - return GenerateUnary(operation, "sqrt", Type::Float, Type::Float, false); + Expression FSqrt(Operation operation) { + return GenerateUnary(operation, "sqrt", Type::Float, Type::Float); } - std::string FRoundEven(Operation operation) { - return GenerateUnary(operation, "roundEven", Type::Float, Type::Float, false); + Expression FRoundEven(Operation operation) { + return GenerateUnary(operation, "roundEven", Type::Float, Type::Float); } - std::string FFloor(Operation operation) { - return GenerateUnary(operation, "floor", Type::Float, Type::Float, false); + Expression FFloor(Operation operation) { + return GenerateUnary(operation, "floor", Type::Float, Type::Float); } - std::string FCeil(Operation operation) { - return GenerateUnary(operation, "ceil", Type::Float, Type::Float, false); + Expression FCeil(Operation operation) { + return GenerateUnary(operation, "ceil", Type::Float, Type::Float); } - std::string FTrunc(Operation operation) { - return GenerateUnary(operation, "trunc", Type::Float, Type::Float, false); + Expression FTrunc(Operation operation) { + return GenerateUnary(operation, "trunc", Type::Float, Type::Float); } template <Type type> - std::string FCastInteger(Operation operation) { - return GenerateUnary(operation, "float", Type::Float, type, false); + Expression FCastInteger(Operation operation) { + return GenerateUnary(operation, "float", Type::Float, type); } - std::string ICastFloat(Operation operation) { - return GenerateUnary(operation, "int", Type::Int, Type::Float, false); + Expression ICastFloat(Operation operation) { + return GenerateUnary(operation, "int", Type::Int, Type::Float); } - std::string ICastUnsigned(Operation operation) { - return GenerateUnary(operation, "int", Type::Int, Type::Uint, false); + Expression ICastUnsigned(Operation operation) { + return GenerateUnary(operation, "int", Type::Int, Type::Uint); } template <Type type> - std::string LogicalShiftLeft(Operation operation) { + Expression LogicalShiftLeft(Operation operation) { return GenerateBinaryInfix(operation, "<<", type, type, Type::Uint); } - std::string ILogicalShiftRight(Operation operation) { - const std::string op_a = VisitOperand(operation, 0, Type::Uint); - const std::string op_b = VisitOperand(operation, 1, Type::Uint); - const std::string op_str = fmt::format("int({} >> {})", op_a, op_b); + Expression ILogicalShiftRight(Operation operation) { + const std::string op_a = VisitOperand(operation, 0).AsUint(); + const std::string op_b = VisitOperand(operation, 1).AsUint(); + std::string op_str = fmt::format("int({} >> {})", op_a, op_b); - return ApplyPrecise(operation, BitwiseCastResult(op_str, Type::Int)); + return ApplyPrecise(operation, std::move(op_str), Type::Int); } - std::string IArithmeticShiftRight(Operation operation) { + Expression IArithmeticShiftRight(Operation operation) { return GenerateBinaryInfix(operation, ">>", Type::Int, Type::Int, Type::Uint); } template <Type type> - std::string BitwiseAnd(Operation operation) { + Expression BitwiseAnd(Operation operation) { return GenerateBinaryInfix(operation, "&", type, type, type); } template <Type type> - std::string BitwiseOr(Operation operation) { + Expression BitwiseOr(Operation operation) { return GenerateBinaryInfix(operation, "|", type, type, type); } template <Type type> - std::string BitwiseXor(Operation operation) { + Expression BitwiseXor(Operation operation) { return GenerateBinaryInfix(operation, "^", type, type, type); } template <Type type> - std::string BitwiseNot(Operation operation) { - return GenerateUnary(operation, "~", type, type, false); + Expression BitwiseNot(Operation operation) { + return GenerateUnary(operation, "~", type, type); } - std::string UCastFloat(Operation operation) { - return GenerateUnary(operation, "uint", Type::Uint, Type::Float, false); + Expression UCastFloat(Operation operation) { + return GenerateUnary(operation, "uint", Type::Uint, Type::Float); } - std::string UCastSigned(Operation operation) { - return GenerateUnary(operation, "uint", Type::Uint, Type::Int, false); + Expression UCastSigned(Operation operation) { + return GenerateUnary(operation, "uint", Type::Uint, Type::Int); } - std::string UShiftRight(Operation operation) { + Expression UShiftRight(Operation operation) { return GenerateBinaryInfix(operation, ">>", Type::Uint, Type::Uint, Type::Uint); } template <Type type> - std::string BitfieldInsert(Operation operation) { + Expression BitfieldInsert(Operation operation) { return GenerateQuaternary(operation, "bitfieldInsert", type, type, type, Type::Int, Type::Int); } template <Type type> - std::string BitfieldExtract(Operation operation) { + Expression BitfieldExtract(Operation operation) { return GenerateTernary(operation, "bitfieldExtract", type, type, Type::Int, Type::Int); } template <Type type> - std::string BitCount(Operation operation) { - return GenerateUnary(operation, "bitCount", type, type, false); + Expression BitCount(Operation operation) { + return GenerateUnary(operation, "bitCount", type, type); } - std::string HNegate(Operation operation) { + Expression HNegate(Operation operation) { const auto GetNegate = [&](std::size_t index) { - return VisitOperand(operation, index, Type::Bool) + " ? -1 : 1"; + return VisitOperand(operation, index).AsBool() + " ? -1 : 1"; }; - const std::string value = - fmt::format("({} * vec2({}, {}))", VisitOperand(operation, 0, Type::HalfFloat), - GetNegate(1), GetNegate(2)); - return BitwiseCastResult(value, Type::HalfFloat); - } - - std::string HClamp(Operation operation) { - const std::string value = VisitOperand(operation, 0, Type::HalfFloat); - const std::string min = VisitOperand(operation, 1, Type::Float); - const std::string max = VisitOperand(operation, 2, Type::Float); - const std::string clamped = fmt::format("clamp({}, vec2({}), vec2({}))", value, min, max); - - return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat)); - } - - std::string HUnpack(Operation operation) { - const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)}; - const auto value = [&]() -> std::string { - switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { - case Tegra::Shader::HalfType::H0_H1: - return operand; - case Tegra::Shader::HalfType::F32: - return fmt::format("vec2(fromHalf2({}))", operand); - case Tegra::Shader::HalfType::H0_H0: - return fmt::format("vec2({}[0])", operand); - case Tegra::Shader::HalfType::H1_H1: - return fmt::format("vec2({}[1])", operand); - } - UNREACHABLE(); - return "0"; - }(); - return fmt::format("fromHalf2({})", value); + return {fmt::format("({} * vec2({}, {}))", VisitOperand(operation, 0).AsHalfFloat(), + GetNegate(1), GetNegate(2)), + Type::HalfFloat}; + } + + Expression HClamp(Operation operation) { + const std::string value = VisitOperand(operation, 0).AsHalfFloat(); + const std::string min = VisitOperand(operation, 1).AsFloat(); + const std::string max = VisitOperand(operation, 2).AsFloat(); + std::string clamped = fmt::format("clamp({}, vec2({}), vec2({}))", value, min, max); + + return ApplyPrecise(operation, std::move(clamped), Type::HalfFloat); + } + + Expression HCastFloat(Operation operation) { + return {fmt::format("vec2({})", VisitOperand(operation, 0).AsFloat()), Type::HalfFloat}; } - std::string HMergeF32(Operation operation) { - return fmt::format("float(toHalf2({})[0])", Visit(operation[0])); + Expression HUnpack(Operation operation) { + Expression operand = VisitOperand(operation, 0); + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: + return {fmt::format("vec2({})", operand.AsFloat()), Type::HalfFloat}; + case Tegra::Shader::HalfType::H0_H0: + return {fmt::format("vec2({}[0])", operand.AsHalfFloat()), Type::HalfFloat}; + case Tegra::Shader::HalfType::H1_H1: + return {fmt::format("vec2({}[1])", operand.AsHalfFloat()), Type::HalfFloat}; + } + } + + Expression HMergeF32(Operation operation) { + return {fmt::format("float({}[0])", VisitOperand(operation, 0).AsHalfFloat()), Type::Float}; } - std::string HMergeH0(Operation operation) { - return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[1]), - Visit(operation[0])); + Expression HMergeH0(Operation operation) { + std::string dest = VisitOperand(operation, 0).AsUint(); + std::string src = VisitOperand(operation, 1).AsUint(); + return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint}; } - std::string HMergeH1(Operation operation) { - return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[0]), - Visit(operation[1])); + Expression HMergeH1(Operation operation) { + std::string dest = VisitOperand(operation, 0).AsUint(); + std::string src = VisitOperand(operation, 1).AsUint(); + return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint}; } - std::string HPack2(Operation operation) { - return fmt::format("utof(packHalf2x16(vec2({}, {})))", Visit(operation[0]), - Visit(operation[1])); + Expression HPack2(Operation operation) { + return {fmt::format("vec2({}, {})", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::HalfFloat}; } template <Type type> - std::string LogicalLessThan(Operation operation) { + Expression LogicalLessThan(Operation operation) { return GenerateBinaryInfix(operation, "<", Type::Bool, type, type); } template <Type type> - std::string LogicalEqual(Operation operation) { + Expression LogicalEqual(Operation operation) { return GenerateBinaryInfix(operation, "==", Type::Bool, type, type); } template <Type type> - std::string LogicalLessEqual(Operation operation) { + Expression LogicalLessEqual(Operation operation) { return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type); } template <Type type> - std::string LogicalGreaterThan(Operation operation) { + Expression LogicalGreaterThan(Operation operation) { return GenerateBinaryInfix(operation, ">", Type::Bool, type, type); } template <Type type> - std::string LogicalNotEqual(Operation operation) { + Expression LogicalNotEqual(Operation operation) { return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type); } template <Type type> - std::string LogicalGreaterEqual(Operation operation) { + Expression LogicalGreaterEqual(Operation operation) { return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type); } - std::string LogicalFIsNan(Operation operation) { - return GenerateUnary(operation, "isnan", Type::Bool, Type::Float, false); + Expression LogicalFIsNan(Operation operation) { + return GenerateUnary(operation, "isnan", Type::Bool, Type::Float); } - std::string LogicalAssign(Operation operation) { + Expression LogicalAssign(Operation operation) { const Node& dest = operation[0]; const Node& src = operation[1]; @@ -1276,82 +1593,80 @@ private: target = GetInternalFlag(flag->GetFlag()); } - code.AddLine("{} = {};", target, Visit(src)); + code.AddLine("{} = {};", target, Visit(src).AsBool()); return {}; } - std::string LogicalAnd(Operation operation) { + Expression LogicalAnd(Operation operation) { return GenerateBinaryInfix(operation, "&&", Type::Bool, Type::Bool, Type::Bool); } - std::string LogicalOr(Operation operation) { + Expression LogicalOr(Operation operation) { return GenerateBinaryInfix(operation, "||", Type::Bool, Type::Bool, Type::Bool); } - std::string LogicalXor(Operation operation) { + Expression LogicalXor(Operation operation) { return GenerateBinaryInfix(operation, "^^", Type::Bool, Type::Bool, Type::Bool); } - std::string LogicalNegate(Operation operation) { - return GenerateUnary(operation, "!", Type::Bool, Type::Bool, false); + Expression LogicalNegate(Operation operation) { + return GenerateUnary(operation, "!", Type::Bool, Type::Bool); } - std::string LogicalPick2(Operation operation) { - const std::string pair = VisitOperand(operation, 0, Type::Bool2); - return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint)); + Expression LogicalPick2(Operation operation) { + return {fmt::format("{}[{}]", VisitOperand(operation, 0).AsBool2(), + VisitOperand(operation, 1).AsUint()), + Type::Bool}; } - std::string LogicalAll2(Operation operation) { + Expression LogicalAnd2(Operation operation) { return GenerateUnary(operation, "all", Type::Bool, Type::Bool2); } - std::string LogicalAny2(Operation operation) { - return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); - } - template <bool with_nan> - std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) { - const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, - Type::HalfFloat, Type::HalfFloat)}; + Expression GenerateHalfComparison(Operation operation, std::string_view compare_op) { + Expression comparison = GenerateBinaryCall(operation, compare_op, Type::Bool2, + Type::HalfFloat, Type::HalfFloat); if constexpr (!with_nan) { return comparison; } - return fmt::format("halfFloatNanComparison({}, {}, {})", comparison, - VisitOperand(operation, 0, Type::HalfFloat), - VisitOperand(operation, 1, Type::HalfFloat)); + return {fmt::format("HalfFloatNanComparison({}, {}, {})", comparison.AsBool2(), + VisitOperand(operation, 0).AsHalfFloat(), + VisitOperand(operation, 1).AsHalfFloat()), + Type::Bool2}; } template <bool with_nan> - std::string Logical2HLessThan(Operation operation) { + Expression Logical2HLessThan(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "lessThan"); } template <bool with_nan> - std::string Logical2HEqual(Operation operation) { + Expression Logical2HEqual(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "equal"); } template <bool with_nan> - std::string Logical2HLessEqual(Operation operation) { + Expression Logical2HLessEqual(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "lessThanEqual"); } template <bool with_nan> - std::string Logical2HGreaterThan(Operation operation) { + Expression Logical2HGreaterThan(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "greaterThan"); } template <bool with_nan> - std::string Logical2HNotEqual(Operation operation) { + Expression Logical2HNotEqual(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "notEqual"); } template <bool with_nan> - std::string Logical2HGreaterEqual(Operation operation) { + Expression Logical2HGreaterEqual(Operation operation) { return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual"); } - std::string Texture(Operation operation) { + Expression Texture(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); @@ -1360,10 +1675,10 @@ private: if (meta->sampler.IsShadow()) { expr = "vec4(" + expr + ')'; } - return expr + GetSwizzle(meta->element); + return {expr + GetSwizzle(meta->element), Type::Float}; } - std::string TextureLod(Operation operation) { + Expression TextureLod(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); @@ -1372,54 +1687,54 @@ private: if (meta->sampler.IsShadow()) { expr = "vec4(" + expr + ')'; } - return expr + GetSwizzle(meta->element); + return {expr + GetSwizzle(meta->element), Type::Float}; } - std::string TextureGather(Operation operation) { + Expression TextureGather(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int; - return GenerateTexture(operation, "Gather", - {TextureArgument{type, meta->component}, TextureAoffi{}}) + - GetSwizzle(meta->element); + return {GenerateTexture(operation, "Gather", + {TextureArgument{type, meta->component}, TextureAoffi{}}) + + GetSwizzle(meta->element), + Type::Float}; } - std::string TextureQueryDimensions(Operation operation) { + Expression TextureQueryDimensions(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); const std::string sampler = GetSampler(meta->sampler); - const std::string lod = VisitOperand(operation, 0, Type::Int); + const std::string lod = VisitOperand(operation, 0).AsInt(); switch (meta->element) { case 0: case 1: - return fmt::format("itof(int(textureSize({}, {}){}))", sampler, lod, - GetSwizzle(meta->element)); - case 2: - return "0"; + return {fmt::format("textureSize({}, {}){}", sampler, lod, GetSwizzle(meta->element)), + Type::Int}; case 3: - return fmt::format("itof(textureQueryLevels({}))", sampler); + return {fmt::format("textureQueryLevels({})", sampler), Type::Int}; } UNREACHABLE(); - return "0"; + return {"0", Type::Int}; } - std::string TextureQueryLod(Operation operation) { + Expression TextureQueryLod(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); if (meta->element < 2) { - return fmt::format("itof(int(({} * vec2(256)){}))", - GenerateTexture(operation, "QueryLod", {}), - GetSwizzle(meta->element)); + return {fmt::format("int(({} * vec2(256)){})", + GenerateTexture(operation, "QueryLod", {}), + GetSwizzle(meta->element)), + Type::Int}; } - return "0"; + return {"0", Type::Int}; } - std::string TexelFetch(Operation operation) { - constexpr std::array<const char*, 4> constructors = {"int", "ivec2", "ivec3", "ivec4"}; + Expression TexelFetch(Operation operation) { + constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); UNIMPLEMENTED_IF(meta->sampler.IsArray()); @@ -1432,60 +1747,117 @@ private: expr += constructors.at(operation.GetOperandsCount() - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += VisitOperand(operation, i, Type::Int); + expr += VisitOperand(operation, i).AsInt(); const std::size_t next = i + 1; if (next == count) expr += ')'; else if (next < count) expr += ", "; } + + // Store a copy of the expression without the lod to be used with texture buffers + std::string expr_buffer = expr; + if (meta->lod) { expr += ", "; - expr += CastOperand(Visit(meta->lod), Type::Int); + expr += Visit(meta->lod).AsInt(); } expr += ')'; + expr += GetSwizzle(meta->element); + + expr_buffer += ')'; + expr_buffer += GetSwizzle(meta->element); - return expr + GetSwizzle(meta->element); + const std::string tmp{code.GenerateTemporary()}; + EmitIfdefIsBuffer(meta->sampler); + code.AddLine("float {} = {};", tmp, expr_buffer); + code.AddLine("#else"); + code.AddLine("float {} = {};", tmp, expr); + code.AddLine("#endif"); + + return {tmp, Type::Float}; + } + + Expression ImageStore(Operation operation) { + const auto meta{std::get<MetaImage>(operation.GetMeta())}; + code.AddLine("imageStore({}, {}, {});", GetImage(meta.image), + BuildIntegerCoordinates(operation), BuildImageValues(operation)); + return {}; + } + + Expression AtomicImageAdd(Operation operation) { + return AtomicImage(operation, "imageAtomicAdd"); + } + + Expression AtomicImageMin(Operation operation) { + return AtomicImage(operation, "imageAtomicMin"); + } + + Expression AtomicImageMax(Operation operation) { + return AtomicImage(operation, "imageAtomicMax"); + } + Expression AtomicImageAnd(Operation operation) { + return AtomicImage(operation, "imageAtomicAnd"); } - std::string Branch(Operation operation) { + Expression AtomicImageOr(Operation operation) { + return AtomicImage(operation, "imageAtomicOr"); + } + + Expression AtomicImageXor(Operation operation) { + return AtomicImage(operation, "imageAtomicXor"); + } + + Expression AtomicImageExchange(Operation operation) { + return AtomicImage(operation, "imageAtomicExchange"); + } + + Expression Branch(Operation operation) { const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); - code.AddLine("jmp_to = 0x{:x}u;", target->GetValue()); + code.AddLine("jmp_to = 0x{:X}U;", target->GetValue()); code.AddLine("break;"); return {}; } - std::string PushFlowStack(Operation operation) { + Expression BranchIndirect(Operation operation) { + const std::string op_a = VisitOperand(operation, 0).AsUint(); + + code.AddLine("jmp_to = {};", op_a); + code.AddLine("break;"); + return {}; + } + + Expression PushFlowStack(Operation operation) { const auto stack = std::get<MetaStackClass>(operation.GetMeta()); const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); - code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack), + code.AddLine("{}[{}++] = 0x{:X}U;", FlowStackName(stack), FlowStackTopName(stack), target->GetValue()); return {}; } - std::string PopFlowStack(Operation operation) { + Expression PopFlowStack(Operation operation) { const auto stack = std::get<MetaStackClass>(operation.GetMeta()); code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack)); code.AddLine("break;"); return {}; } - std::string Exit(Operation operation) { - if (stage != ShaderStage::Fragment) { + Expression Exit(Operation operation) { + if (stage != ProgramType::Fragment) { code.AddLine("return;"); return {}; } const auto& used_registers = ir.GetRegisters(); - const auto SafeGetRegister = [&](u32 reg) -> std::string { + const auto SafeGetRegister = [&](u32 reg) -> Expression { // TODO(Rodrigo): Replace with contains once C++20 releases if (used_registers.find(reg) != used_registers.end()) { - return GetRegister(reg); + return {GetRegister(reg), Type::Float}; } - return "0.0f"; + return {"0.0f", Type::Float}; }; UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented"); @@ -1498,7 +1870,7 @@ private: for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { code.AddLine("FragColor{}[{}] = {};", render_target, component, - SafeGetRegister(current_reg)); + SafeGetRegister(current_reg).AsFloat()); ++current_reg; } } @@ -1507,14 +1879,14 @@ private: if (header.ps.omap.depth) { // The depth output is always 2 registers after the last color output, and current_reg // already contains one past the last color register. - code.AddLine("gl_FragDepth = {};", SafeGetRegister(current_reg + 1)); + code.AddLine("gl_FragDepth = {};", SafeGetRegister(current_reg + 1).AsFloat()); } code.AddLine("return;"); return {}; } - std::string Discard(Operation operation) { + Expression Discard(Operation operation) { // Enclose "discard" in a conditional, so that GLSL compilation does not complain // about unexecuted instructions that may follow this. code.AddLine("if (true) {{"); @@ -1525,8 +1897,8 @@ private: return {}; } - std::string EmitVertex(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + Expression EmitVertex(Operation operation) { + ASSERT_MSG(stage == ProgramType::Geometry, "EmitVertex is expected to be used in a geometry shader."); // If a geometry shader is attached, it will always flip (it's the last stage before @@ -1536,30 +1908,72 @@ private: return {}; } - std::string EndPrimitive(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + Expression EndPrimitive(Operation operation) { + ASSERT_MSG(stage == ProgramType::Geometry, "EndPrimitive is expected to be used in a geometry shader."); code.AddLine("EndPrimitive();"); return {}; } - std::string YNegate(Operation operation) { + Expression YNegate(Operation operation) { // Config pack's third value is Y_NEGATE's state. - return "uintBitsToFloat(config_pack[2])"; + return {"config_pack[2]", Type::Uint}; } template <u32 element> - std::string LocalInvocationId(Operation) { - return "utof(gl_LocalInvocationID"s + GetSwizzle(element) + ')'; + Expression LocalInvocationId(Operation) { + return {"gl_LocalInvocationID"s + GetSwizzle(element), Type::Uint}; } template <u32 element> - std::string WorkGroupId(Operation) { - return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; + Expression WorkGroupId(Operation) { + return {"gl_WorkGroupID"s + GetSwizzle(element), Type::Uint}; + } + + Expression BallotThread(Operation operation) { + const std::string value = VisitOperand(operation, 0).AsBool(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia warp intrinsics are not available and its required by a shader"); + // Stub on non-Nvidia devices by simulating all threads voting the same as the active + // one. + return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint}; + } + return {fmt::format("ballotThreadNV({})", value), Type::Uint}; + } + + Expression Vote(Operation operation, const char* func) { + const std::string value = VisitOperand(operation, 0).AsBool(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia vote intrinsics are not available and its required by a shader"); + // Stub with a warp size of one. + return {value, Type::Bool}; + } + return {fmt::format("{}({})", func, value), Type::Bool}; + } + + Expression VoteAll(Operation operation) { + return Vote(operation, "allThreadsNV"); } - static constexpr OperationDecompilersArray operation_decompilers = { + Expression VoteAny(Operation operation) { + return Vote(operation, "anyThreadNV"); + } + + Expression VoteEqual(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "Nvidia vote intrinsics are not available and its required by a shader"); + // We must return true here since a stub for a theoretical warp size of 1 will always + // return an equal result for all its votes. + return {"true", Type::Bool}; + } + return Vote(operation, "allThreadsEqualNV"); + } + + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, &GLSLDecompiler::Select, @@ -1571,6 +1985,8 @@ private: &GLSLDecompiler::Negate<Type::Float>, &GLSLDecompiler::Absolute<Type::Float>, &GLSLDecompiler::FClamp, + &GLSLDecompiler::FCastHalf0, + &GLSLDecompiler::FCastHalf1, &GLSLDecompiler::Min<Type::Float>, &GLSLDecompiler::Max<Type::Float>, &GLSLDecompiler::FCos, @@ -1631,6 +2047,7 @@ private: &GLSLDecompiler::Absolute<Type::HalfFloat>, &GLSLDecompiler::HNegate, &GLSLDecompiler::HClamp, + &GLSLDecompiler::HCastFloat, &GLSLDecompiler::HUnpack, &GLSLDecompiler::HMergeF32, &GLSLDecompiler::HMergeH0, @@ -1643,8 +2060,7 @@ private: &GLSLDecompiler::LogicalXor, &GLSLDecompiler::LogicalNegate, &GLSLDecompiler::LogicalPick2, - &GLSLDecompiler::LogicalAll2, - &GLSLDecompiler::LogicalAny2, + &GLSLDecompiler::LogicalAnd2, &GLSLDecompiler::LogicalLessThan<Type::Float>, &GLSLDecompiler::LogicalEqual<Type::Float>, @@ -1688,7 +2104,17 @@ private: &GLSLDecompiler::TextureQueryLod, &GLSLDecompiler::TexelFetch, + &GLSLDecompiler::ImageStore, + &GLSLDecompiler::AtomicImageAdd, + &GLSLDecompiler::AtomicImageMin, + &GLSLDecompiler::AtomicImageMax, + &GLSLDecompiler::AtomicImageAnd, + &GLSLDecompiler::AtomicImageOr, + &GLSLDecompiler::AtomicImageXor, + &GLSLDecompiler::AtomicImageExchange, + &GLSLDecompiler::Branch, + &GLSLDecompiler::BranchIndirect, &GLSLDecompiler::PushFlowStack, &GLSLDecompiler::PopFlowStack, &GLSLDecompiler::Exit, @@ -1704,7 +2130,13 @@ private: &GLSLDecompiler::WorkGroupId<0>, &GLSLDecompiler::WorkGroupId<1>, &GLSLDecompiler::WorkGroupId<2>, + + &GLSLDecompiler::BallotThread, + &GLSLDecompiler::VoteAll, + &GLSLDecompiler::VoteAny, + &GLSLDecompiler::VoteEqual, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { return GetDeclarationWithSuffix(index, "gpr"); @@ -1744,8 +2176,8 @@ private: } std::string GetInternalFlag(InternalFlag flag) const { - constexpr std::array<const char*, 4> InternalFlagNames = {"zero_flag", "sign_flag", - "carry_flag", "overflow_flag"}; + constexpr std::array InternalFlagNames = {"zero_flag", "sign_flag", "carry_flag", + "overflow_flag"}; const auto index = static_cast<u32>(flag); ASSERT(index < static_cast<u32>(InternalFlag::Amount)); @@ -1756,12 +2188,20 @@ private: return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); } + std::string GetImage(const Image& image) const { + return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image"); + } + + void EmitIfdefIsBuffer(const Sampler& sampler) { + code.AddLine("#ifdef SAMPLER_{}_IS_BUFFER", sampler.GetIndex()); + } + std::string GetDeclarationWithSuffix(u32 index, const std::string& name) const { return fmt::format("{}_{}_{}", name, index, suffix); } u32 GetNumPhysicalInputAttributes() const { - return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -1774,7 +2214,7 @@ private: const Device& device; const ShaderIR& ir; - const ShaderStage stage; + const ProgramType stage; const std::string suffix; const Header header; @@ -1785,27 +2225,19 @@ private: std::string GetCommonDeclarations() { return fmt::format( - "#define MAX_CONSTBUFFER_ELEMENTS {}\n" "#define ftoi floatBitsToInt\n" "#define ftou floatBitsToUint\n" "#define itof intBitsToFloat\n" "#define utof uintBitsToFloat\n\n" - "float fromHalf2(vec2 pair) {{\n" - " return utof(packHalf2x16(pair));\n" - "}}\n\n" - "vec2 toHalf2(float value) {{\n" - " return unpackHalf2x16(ftou(value));\n" - "}}\n\n" - "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n" + "bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n" " bvec2 is_nan1 = isnan(pair1);\n" " bvec2 is_nan2 = isnan(pair2);\n" " return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || " "is_nan2.y);\n" - "}}\n", - MAX_CONSTBUFFER_ELEMENTS); + "}}\n\n"); } -ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, +ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage, const std::string& suffix) { GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index c1569e737..2ea02f5bf 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,14 +12,26 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" -namespace OpenGL { -class Device; -} - namespace VideoCommon::Shader { class ShaderIR; } +namespace OpenGL { + +class Device; + +enum class ProgramType : u32 { + VertexA = 0, + VertexB = 1, + TessellationControl = 2, + TessellationEval = 3, + Geometry = 4, + Fragment = 5, + Compute = 6 +}; + +} // namespace OpenGL + namespace OpenGL::GLShader { struct ShaderEntries; @@ -27,6 +39,7 @@ struct ShaderEntries; using Maxwell = Tegra::Engines::Maxwell3D::Regs; using ProgramResult = std::pair<std::string, ShaderEntries>; using SamplerEntry = VideoCommon::Shader::Sampler; +using ImageEntry = VideoCommon::Shader::Image; class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer { public: @@ -74,14 +87,16 @@ struct ShaderEntries { std::vector<ConstBufferEntry> const_buffers; std::vector<SamplerEntry> samplers; std::vector<SamplerEntry> bindless_samplers; + std::vector<ImageEntry> images; std::vector<GlobalMemoryEntry> global_memory_entries; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + bool shader_viewport_layer_array{}; std::size_t shader_length{}; }; std::string GetCommonDeclarations(); ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Maxwell::ShaderStage stage, const std::string& suffix); + ProgramType stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index ee4a45ca2..f141c4e3b 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -34,11 +34,11 @@ enum class PrecompiledEntryKind : u32 { Dump, }; -constexpr u32 NativeVersion = 1; +constexpr u32 NativeVersion = 4; // Making sure sizes doesn't change by accident -static_assert(sizeof(BaseBindings) == 12); -static_assert(sizeof(ShaderDiskCacheUsage) == 24); +static_assert(sizeof(BaseBindings) == 16); +static_assert(sizeof(ShaderDiskCacheUsage) == 40); namespace { @@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, +ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b) : unique_identifier{unique_identifier}, program_type{program_type}, @@ -332,11 +332,37 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn static_cast<Tegra::Shader::TextureType>(type), is_array, is_shadow, is_bindless); } + u32 images_count{}; + if (!LoadObjectFromPrecompiled(images_count)) { + return {}; + } + for (u32 i = 0; i < images_count; ++i) { + u64 offset{}; + u64 index{}; + u32 type{}; + u8 is_bindless{}; + u8 is_written{}; + u8 is_read{}; + u8 is_size_known{}; + u32 size{}; + if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) || + !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_bindless) || + !LoadObjectFromPrecompiled(is_written) || !LoadObjectFromPrecompiled(is_read) || + !LoadObjectFromPrecompiled(is_size_known) || !LoadObjectFromPrecompiled(size)) { + return {}; + } + entry.entries.images.emplace_back( + static_cast<std::size_t>(offset), static_cast<std::size_t>(index), + static_cast<Tegra::Shader::ImageType>(type), is_bindless != 0, is_written != 0, + is_read != 0, + is_size_known ? std::make_optional(static_cast<Tegra::Shader::ImageAtomicSize>(size)) + : std::nullopt); + } + u32 global_memory_count{}; if (!LoadObjectFromPrecompiled(global_memory_count)) { return {}; } - for (u32 i = 0; i < global_memory_count; ++i) { u32 cbuf_index{}; u32 cbuf_offset{}; @@ -356,11 +382,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn } } + bool shader_viewport_layer_array{}; + if (!LoadObjectFromPrecompiled(shader_viewport_layer_array)) { + return {}; + } + entry.entries.shader_viewport_layer_array = shader_viewport_layer_array; + u64 shader_length{}; if (!LoadObjectFromPrecompiled(shader_length)) { return {}; } - entry.entries.shader_length = static_cast<std::size_t>(shader_length); return entry; @@ -400,6 +431,22 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std: } } + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.images.size()))) { + return false; + } + for (const auto& image : entries.images) { + const u32 size = image.IsSizeKnown() ? static_cast<u32>(image.GetSize()) : 0U; + if (!SaveObjectToPrecompiled(static_cast<u64>(image.GetOffset())) || + !SaveObjectToPrecompiled(static_cast<u64>(image.GetIndex())) || + !SaveObjectToPrecompiled(static_cast<u32>(image.GetType())) || + !SaveObjectToPrecompiled(static_cast<u8>(image.IsBindless() ? 1 : 0)) || + !SaveObjectToPrecompiled(static_cast<u8>(image.IsWritten() ? 1 : 0)) || + !SaveObjectToPrecompiled(static_cast<u8>(image.IsRead() ? 1 : 0)) || + !SaveObjectToPrecompiled(image.IsSizeKnown()) || !SaveObjectToPrecompiled(size)) { + return false; + } + } + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.global_memory_entries.size()))) { return false; } @@ -417,6 +464,10 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std: } } + if (!SaveObjectToPrecompiled(entries.shader_viewport_layer_array)) { + return false; + } + if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) { return false; } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index ecd72ba58..cc8bbd61e 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -4,6 +4,7 @@ #pragma once +#include <bitset> #include <optional> #include <string> #include <tuple> @@ -17,7 +18,6 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" -#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" namespace Core { @@ -30,22 +30,23 @@ class IOFile; namespace OpenGL { -using ProgramCode = std::vector<u64>; -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - struct ShaderDiskCacheUsage; struct ShaderDiskCacheDump; +using ProgramCode = std::vector<u64>; using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; +using TextureBufferUsage = std::bitset<64>; /// Allocated bindings used by an OpenGL shader program struct BaseBindings { u32 cbuf{}; u32 gmem{}; u32 sampler{}; + u32 image{}; bool operator==(const BaseBindings& rhs) const { - return std::tie(cbuf, gmem, sampler) == std::tie(rhs.cbuf, rhs.gmem, rhs.sampler); + return std::tie(cbuf, gmem, sampler, image) == + std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image); } bool operator!=(const BaseBindings& rhs) const { @@ -53,15 +54,29 @@ struct BaseBindings { } }; -/// Describes how a shader is used +/// Describes the different variants a single program can be compiled. +struct ProgramVariant { + BaseBindings base_bindings; + GLenum primitive_mode{}; + TextureBufferUsage texture_buffer_usage{}; + + bool operator==(const ProgramVariant& rhs) const { + return std::tie(base_bindings, primitive_mode, texture_buffer_usage) == + std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.texture_buffer_usage); + } + + bool operator!=(const ProgramVariant& rhs) const { + return !operator==(rhs); + } +}; + +/// Describes how a shader is used. struct ShaderDiskCacheUsage { u64 unique_identifier{}; - BaseBindings bindings; - GLenum primitive{}; + ProgramVariant variant; bool operator==(const ShaderDiskCacheUsage& rhs) const { - return std::tie(unique_identifier, bindings, primitive) == - std::tie(rhs.unique_identifier, rhs.bindings, rhs.primitive); + return std::tie(unique_identifier, variant) == std::tie(rhs.unique_identifier, rhs.variant); } bool operator!=(const ShaderDiskCacheUsage& rhs) const { @@ -76,7 +91,19 @@ namespace std { template <> struct hash<OpenGL::BaseBindings> { std::size_t operator()(const OpenGL::BaseBindings& bindings) const noexcept { - return bindings.cbuf | bindings.gmem << 8 | bindings.sampler << 16; + return static_cast<std::size_t>(bindings.cbuf) ^ + (static_cast<std::size_t>(bindings.gmem) << 8) ^ + (static_cast<std::size_t>(bindings.sampler) << 16) ^ + (static_cast<std::size_t>(bindings.image) << 24); + } +}; + +template <> +struct hash<OpenGL::ProgramVariant> { + std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept { + return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^ + std::hash<OpenGL::TextureBufferUsage>()(variant.texture_buffer_usage) ^ + (static_cast<std::size_t>(variant.primitive_mode) << 6); } }; @@ -84,7 +111,7 @@ template <> struct hash<OpenGL::ShaderDiskCacheUsage> { std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept { return static_cast<std::size_t>(usage.unique_identifier) ^ - std::hash<OpenGL::BaseBindings>()(usage.bindings) ^ usage.primitive << 16; + std::hash<OpenGL::ProgramVariant>()(usage.variant); } }; @@ -95,7 +122,7 @@ namespace OpenGL { /// Describes a shader how it's used by the guest GPU class ShaderDiskCacheRaw { public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, + explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b); ShaderDiskCacheRaw(); @@ -110,30 +137,13 @@ public: } bool HasProgramA() const { - return program_type == Maxwell::ShaderProgram::VertexA; + return program_type == ProgramType::VertexA; } - Maxwell::ShaderProgram GetProgramType() const { + ProgramType GetProgramType() const { return program_type; } - Maxwell::ShaderStage GetProgramStage() const { - switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: - return Maxwell::ShaderStage::Vertex; - case Maxwell::ShaderProgram::TesselationControl: - return Maxwell::ShaderStage::TesselationControl; - case Maxwell::ShaderProgram::TesselationEval: - return Maxwell::ShaderStage::TesselationEval; - case Maxwell::ShaderProgram::Geometry: - return Maxwell::ShaderStage::Geometry; - case Maxwell::ShaderProgram::Fragment: - return Maxwell::ShaderStage::Fragment; - } - UNREACHABLE(); - } - const ProgramCode& GetProgramCode() const { return program_code; } @@ -144,7 +154,7 @@ public: private: u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; u32 program_code_size{}; u32 program_code_size_b{}; @@ -275,26 +285,17 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - bool LoadObjectFromPrecompiled(bool& object) { - u8 value; - const bool read_ok = LoadArrayFromPrecompiled(&value, 1); - if (!read_ok) { - return false; - } - - object = value != 0; - return true; - } - - // Core system Core::System& system; - // Stored transferable shaders - std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; - // Stores whole precompiled cache which will be read from/saved to the precompiled cache file + + // Stores whole precompiled cache which will be read from or saved to the precompiled chache + // file FileSys::VectorVfsFile precompiled_cache_virtual_file; // Stores the current offset of the precompiled cache file for IO purposes std::size_t precompiled_cache_virtual_file_offset = 0; + // Stored transferable shaders + std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; + // The cache has been loaded at boot bool tried_to_load{}; }; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 9148629ec..3a8d9e1da 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D; using VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::ShaderIR; -static constexpr u32 PROGRAM_OFFSET{10}; +static constexpr u32 PROGRAM_OFFSET = 10; +static constexpr u32 COMPUTE_OFFSET = 0; ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); @@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB; + ProgramResult program = Decompile(device, program_ir, stage, "vertex"); out += program.first; if (setup.IsDualProgram()) { - const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); - ProgramResult program_b = - Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); - + const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b); + ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b"); out += program_b.first; } @@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry"); out += program.first; out += R"( @@ -115,10 +114,8 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); - + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment"); out += program.first; out += R"( @@ -130,4 +127,22 @@ void main() { return {std::move(out), std::move(program.second)}; } +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) { + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + + std::string out = "// Shader Unique Id: CS" + id + "\n\n"; + out += GetCommonDeclarations(); + + const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute"); + out += program.first; + + out += R"( +void main() { + execute_compute(); +} +)"; + return {std::move(out), std::move(program.second)}; +} + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index 0536c8a03..3833e88ab 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -27,6 +27,8 @@ struct ShaderSetup { ProgramCode code; ProgramCode code_b; // Used for dual vertex shaders u64 unique_identifier; + std::size_t size_a; + std::size_t size_b; } program; /// Used in scenarios where we have a dual vertex shaders @@ -52,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se /// Generates the GLSL fragment shader program source code for the given FS program ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); +/// Generates the GLSL compute shader program source code for the given CS program +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup); + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 5f3fe067e..9e74eda0d 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -10,21 +10,25 @@ namespace OpenGL::GLShader { -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type; +namespace { +const char* GetStageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: - debug_type = "vertex"; - break; + return "vertex"; case GL_GEOMETRY_SHADER: - debug_type = "geometry"; - break; + return "geometry"; case GL_FRAGMENT_SHADER: - debug_type = "fragment"; - break; - default: - UNREACHABLE(); + return "fragment"; + case GL_COMPUTE_SHADER: + return "compute"; } + UNIMPLEMENTED(); + return "unknown"; +} +} // Anonymous namespace + +GLuint LoadShader(const char* source, GLenum type) { + const char* debug_type = GetStageDebugName(type); const GLuint shader_id = glCreateShader(type); glShaderSource(shader_id, 1, &source, nullptr); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index d86e137ac..6eabf4fac 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -6,8 +6,11 @@ #include <glad/glad.h> #include "common/assert.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "video_core/renderer_opengl/gl_state.h" +MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128)); + namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; @@ -31,6 +34,25 @@ bool UpdateTie(T1 current_value, const T2 new_value) { return changed; } +template <typename T> +std::optional<std::pair<GLuint, GLsizei>> UpdateArray(T& current_values, const T& new_values) { + std::optional<std::size_t> first; + std::size_t last; + for (std::size_t i = 0; i < std::size(current_values); ++i) { + if (!UpdateValue(current_values[i], new_values[i])) { + continue; + } + if (!first) { + first = i; + } + last = i; + } + if (!first) { + return std::nullopt; + } + return std::make_pair(static_cast<GLuint>(*first), static_cast<GLsizei>(last - *first + 1)); +} + void Enable(GLenum cap, bool enable) { if (enable) { glEnable(cap); @@ -131,10 +153,6 @@ OpenGLState::OpenGLState() { logic_op.enabled = false; logic_op.operation = GL_COPY; - for (auto& texture_unit : texture_units) { - texture_unit.Reset(); - } - draw.read_framebuffer = 0; draw.draw_framebuffer = 0; draw.vertex_array = 0; @@ -162,6 +180,25 @@ OpenGLState::OpenGLState() { alpha_test.ref = 0.0f; } +void OpenGLState::SetDefaultViewports() { + for (auto& item : viewports) { + item.x = 0; + item.y = 0; + item.width = 0; + item.height = 0; + item.depth_range_near = 0.0f; + item.depth_range_far = 1.0f; + item.scissor.enabled = false; + item.scissor.x = 0; + item.scissor.y = 0; + item.scissor.width = 0; + item.scissor.height = 0; + } + + depth_clamp.far_plane = false; + depth_clamp.near_plane = false; +} + void OpenGLState::ApplyDefaultState() { glEnable(GL_BLEND); glDisable(GL_FRAMEBUFFER_SRGB); @@ -474,56 +511,25 @@ void OpenGLState::ApplyAlphaTest() const { } void OpenGLState::ApplyTextures() const { - bool has_delta{}; - std::size_t first{}; - std::size_t last{}; - std::array<GLuint, Maxwell::NumTextureSamplers> textures; - - for (std::size_t i = 0; i < std::size(texture_units); ++i) { - const auto& texture_unit = texture_units[i]; - auto& cur_state_texture_unit = cur_state.texture_units[i]; - textures[i] = texture_unit.texture; - if (cur_state_texture_unit.texture == textures[i]) { - continue; - } - cur_state_texture_unit.texture = textures[i]; - if (!has_delta) { - first = i; - has_delta = true; - } - last = i; - } - if (has_delta) { - glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), - textures.data() + first); + if (const auto update = UpdateArray(cur_state.textures, textures)) { + glBindTextures(update->first, update->second, textures.data() + update->first); } } void OpenGLState::ApplySamplers() const { - bool has_delta{}; - std::size_t first{}; - std::size_t last{}; - std::array<GLuint, Maxwell::NumTextureSamplers> samplers; - - for (std::size_t i = 0; i < std::size(samplers); ++i) { - samplers[i] = texture_units[i].sampler; - if (cur_state.texture_units[i].sampler == texture_units[i].sampler) { - continue; - } - cur_state.texture_units[i].sampler = texture_units[i].sampler; - if (!has_delta) { - first = i; - has_delta = true; - } - last = i; + if (const auto update = UpdateArray(cur_state.samplers, samplers)) { + glBindSamplers(update->first, update->second, samplers.data() + update->first); } - if (has_delta) { - glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), - samplers.data() + first); +} + +void OpenGLState::ApplyImages() const { + if (const auto update = UpdateArray(cur_state.images, images)) { + glBindImageTextures(update->first, update->second, images.data() + update->first); } } -void OpenGLState::Apply() const { +void OpenGLState::Apply() { + MICROPROFILE_SCOPE(OpenGL_State); ApplyFramebufferState(); ApplyVertexArrayState(); ApplyShaderProgram(); @@ -532,19 +538,32 @@ void OpenGLState::Apply() const { ApplyPointSize(); ApplyFragmentColorClamp(); ApplyMultisample(); + if (dirty.color_mask) { + ApplyColorMask(); + dirty.color_mask = false; + } ApplyDepthClamp(); - ApplyColorMask(); ApplyViewport(); - ApplyStencilTest(); + if (dirty.stencil_state) { + ApplyStencilTest(); + dirty.stencil_state = false; + } ApplySRgb(); ApplyCulling(); ApplyDepth(); ApplyPrimitiveRestart(); - ApplyBlending(); + if (dirty.blend_state) { + ApplyBlending(); + dirty.blend_state = false; + } ApplyLogicOp(); ApplyTextures(); ApplySamplers(); - ApplyPolygonOffset(); + ApplyImages(); + if (dirty.polygon_offset) { + ApplyPolygonOffset(); + dirty.polygon_offset = false; + } ApplyAlphaTest(); } @@ -571,18 +590,18 @@ void OpenGLState::EmulateViewportWithScissor() { } OpenGLState& OpenGLState::UnbindTexture(GLuint handle) { - for (auto& unit : texture_units) { - if (unit.texture == handle) { - unit.Unbind(); + for (auto& texture : textures) { + if (texture == handle) { + texture = 0; } } return *this; } OpenGLState& OpenGLState::ResetSampler(GLuint handle) { - for (auto& unit : texture_units) { - if (unit.sampler == handle) { - unit.sampler = 0; + for (auto& sampler : samplers) { + if (sampler == handle) { + sampler = 0; } } return *this; diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index b0140495d..949b13051 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -118,21 +118,9 @@ public: GLenum operation; } logic_op; - // 3 texture units - one for each that is used in PICA fragment shader emulation - struct TextureUnit { - GLuint texture; // GL_TEXTURE_BINDING_2D - GLuint sampler; // GL_SAMPLER_BINDING - - void Unbind() { - texture = 0; - } - - void Reset() { - Unbind(); - sampler = 0; - } - }; - std::array<TextureUnit, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> texture_units; + std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures{}; + std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers{}; + std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumImages> images{}; struct { GLuint read_framebuffer; // GL_READ_FRAMEBUFFER_BINDING @@ -195,8 +183,9 @@ public: s_rgb_used = false; } + void SetDefaultViewports(); /// Apply this state as the current OpenGL state - void Apply() const; + void Apply(); void ApplyFramebufferState() const; void ApplyVertexArrayState() const; @@ -219,6 +208,7 @@ public: void ApplyLogicOp() const; void ApplyTextures() const; void ApplySamplers() const; + void ApplyImages() const; void ApplyDepthClamp() const; void ApplyPolygonOffset() const; void ApplyAlphaTest() const; @@ -237,11 +227,41 @@ public: /// Viewport does not affects glClearBuffer so emulate viewport using scissor test void EmulateViewportWithScissor(); + void MarkDirtyBlendState() { + dirty.blend_state = true; + } + + void MarkDirtyStencilState() { + dirty.stencil_state = true; + } + + void MarkDirtyPolygonOffset() { + dirty.polygon_offset = true; + } + + void MarkDirtyColorMask() { + dirty.color_mask = true; + } + + void AllDirty() { + dirty.blend_state = true; + dirty.stencil_state = true; + dirty.polygon_offset = true; + dirty.color_mask = true; + } + private: static OpenGLState cur_state; // Workaround for sRGB problems caused by QT not supporting srgb output static bool s_rgb_used; + struct { + bool blend_state; + bool stencil_state; + bool viewport_state; + bool polygon_offset; + bool color_mask; + } dirty{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index d0b14b3f6..35ba334e4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -15,7 +15,8 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent) +OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, + bool use_persistent) : buffer_size(size) { gl_buffer.Create(); @@ -29,7 +30,7 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (GLAD_GL_ARB_buffer_storage) { + if (use_persistent) { persistent = true; coherent = prefer_coherent; const GLbitfield flags = diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 3d18ecb4d..f8383cbd4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -13,7 +13,8 @@ namespace OpenGL { class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false); + explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, + bool use_persistent = true); ~OGLStreamBuffer(); GLuint GetHandle() const; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp new file mode 100644 index 000000000..4f135fe03 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -0,0 +1,624 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "core/core.h" +#include "video_core/morton.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" +#include "video_core/renderer_opengl/utils.h" +#include "video_core/texture_cache/surface_base.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/textures/convert.h" +#include "video_core/textures/texture.h" + +namespace OpenGL { + +using Tegra::Texture::SwizzleSource; +using VideoCore::MortonSwizzleMode; + +using VideoCore::Surface::ComponentType; +using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceCompression; +using VideoCore::Surface::SurfaceTarget; +using VideoCore::Surface::SurfaceType; + +MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128)); +MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128)); +MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", + MP_RGB(128, 192, 128)); + +namespace { + +struct FormatTuple { + GLint internal_format; + GLenum format; + GLenum type; + ComponentType component_type; + bool compressed; +}; + +constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U + {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false}, // ABGR8S + {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // ABGR8UI + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false}, // B5G6R5U + {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm, + false}, // A2B10G10R10U + {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U + {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // R8U + {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // R8UI + {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false}, // RGBA16F + {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // RGBA16U + {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RGBA16UI + {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float, + false}, // R11FG11FB10F + {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI + {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT1 + {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT23 + {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT45 + {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN1 + {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXN2UNORM + {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM + {GL_COMPRESSED_RGBA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // BC7U + {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, + true}, // BC6H_UF16 + {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, + true}, // BC6H_SF16 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4 + {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // BGRA8 + {GL_RGBA32F, GL_RGBA, GL_FLOAT, ComponentType::Float, false}, // RGBA32F + {GL_RG32F, GL_RG, GL_FLOAT, ComponentType::Float, false}, // RG32F + {GL_R32F, GL_RED, GL_FLOAT, ComponentType::Float, false}, // R32F + {GL_R16F, GL_RED, GL_HALF_FLOAT, ComponentType::Float, false}, // R16F + {GL_R16, GL_RED, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // R16U + {GL_R16_SNORM, GL_RED, GL_SHORT, ComponentType::SNorm, false}, // R16S + {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // R16UI + {GL_R16I, GL_RED_INTEGER, GL_SHORT, ComponentType::SInt, false}, // R16I + {GL_RG16, GL_RG, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // RG16 + {GL_RG16F, GL_RG, GL_HALF_FLOAT, ComponentType::Float, false}, // RG16F + {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RG16UI + {GL_RG16I, GL_RG_INTEGER, GL_SHORT, ComponentType::SInt, false}, // RG16I + {GL_RG16_SNORM, GL_RG, GL_SHORT, ComponentType::SNorm, false}, // RG16S + {GL_RGB32F, GL_RGB, GL_FLOAT, ComponentType::Float, false}, // RGB32F + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, + false}, // RGBA8_SRGB + {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // RG8U + {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false}, // RG8S + {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RG32UI + {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // R32UI + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4 + {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // BGRA8 + // Compressed sRGB formats + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT1_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT23_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // DXT45_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // BC7U_SRGB + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4_SRGB + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8_SRGB + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5_SRGB + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4_SRGB + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5 + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5_SRGB + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8 + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8_SRGB + + // Depth formats + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, ComponentType::UNorm, + false}, // Z16 + + // DepthStencil formats + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm, + false}, // Z24S8 + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm, + false}, // S8Z24 + {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV, + ComponentType::Float, false}, // Z32FS8 +}}; + +const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { + ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); + const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]}; + return format; +} + +GLenum GetTextureTarget(const SurfaceTarget& target) { + switch (target) { + case SurfaceTarget::TextureBuffer: + return GL_TEXTURE_BUFFER; + case SurfaceTarget::Texture1D: + return GL_TEXTURE_1D; + case SurfaceTarget::Texture2D: + return GL_TEXTURE_2D; + case SurfaceTarget::Texture3D: + return GL_TEXTURE_3D; + case SurfaceTarget::Texture1DArray: + return GL_TEXTURE_1D_ARRAY; + case SurfaceTarget::Texture2DArray: + return GL_TEXTURE_2D_ARRAY; + case SurfaceTarget::TextureCubemap: + return GL_TEXTURE_CUBE_MAP; + case SurfaceTarget::TextureCubeArray: + return GL_TEXTURE_CUBE_MAP_ARRAY; + } + UNREACHABLE(); + return {}; +} + +GLint GetSwizzleSource(SwizzleSource source) { + switch (source) { + case SwizzleSource::Zero: + return GL_ZERO; + case SwizzleSource::R: + return GL_RED; + case SwizzleSource::G: + return GL_GREEN; + case SwizzleSource::B: + return GL_BLUE; + case SwizzleSource::A: + return GL_ALPHA; + case SwizzleSource::OneInt: + case SwizzleSource::OneFloat: + return GL_ONE; + } + UNREACHABLE(); + return GL_NONE; +} + +void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { + if (params.IsBuffer()) { + return; + } + glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, params.num_levels - 1); + if (params.num_levels == 1) { + glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); + } +} + +OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum internal_format, + OGLBuffer& texture_buffer) { + OGLTexture texture; + texture.Create(target); + + switch (params.target) { + case SurfaceTarget::Texture1D: + glTextureStorage1D(texture.handle, params.emulated_levels, internal_format, params.width); + break; + case SurfaceTarget::TextureBuffer: + texture_buffer.Create(); + glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(), + nullptr, GL_DYNAMIC_STORAGE_BIT); + glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); + break; + case SurfaceTarget::Texture2D: + case SurfaceTarget::TextureCubemap: + glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, + params.height); + break; + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glTextureStorage3D(texture.handle, params.emulated_levels, internal_format, params.width, + params.height, params.depth); + break; + default: + UNREACHABLE(); + } + + ApplyTextureDefaults(params, texture.handle); + + return texture; +} + +} // Anonymous namespace + +CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) + : VideoCommon::SurfaceBase<View>(gpu_addr, params) { + const auto& tuple{GetFormatTuple(params.pixel_format, params.component_type)}; + internal_format = tuple.internal_format; + format = tuple.format; + type = tuple.type; + is_compressed = tuple.compressed; + target = GetTextureTarget(params.target); + texture = CreateTexture(params, target, internal_format, texture_buffer); + DecorateSurfaceName(); + main_view = CreateViewInner( + ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels), + true); +} + +CachedSurface::~CachedSurface() = default; + +void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { + MICROPROFILE_SCOPE(OpenGL_Texture_Download); + + SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); }); + + for (u32 level = 0; level < params.emulated_levels; ++level) { + glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level))); + glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); + const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level); + if (is_compressed) { + glGetCompressedTextureImage(texture.handle, level, + static_cast<GLsizei>(params.GetHostMipmapSize(level)), + staging_buffer.data() + mip_offset); + } else { + glGetTextureImage(texture.handle, level, format, type, + static_cast<GLsizei>(params.GetHostMipmapSize(level)), + staging_buffer.data() + mip_offset); + } + } +} + +void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { + MICROPROFILE_SCOPE(OpenGL_Texture_Upload); + SCOPE_EXIT({ glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); }); + for (u32 level = 0; level < params.emulated_levels; ++level) { + UploadTextureMipmap(level, staging_buffer); + } +} + +void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer) { + glPixelStorei(GL_UNPACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level))); + glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); + + auto compression_type = params.GetCompressionType(); + + const std::size_t mip_offset = compression_type == SurfaceCompression::Converted + ? params.GetConvertedMipmapOffset(level) + : params.GetHostMipmapLevelOffset(level); + const u8* buffer{staging_buffer.data() + mip_offset}; + if (is_compressed) { + const auto image_size{static_cast<GLsizei>(params.GetHostMipmapSize(level))}; + switch (params.target) { + case SurfaceTarget::Texture2D: + glCompressedTextureSubImage2D(texture.handle, level, 0, 0, + static_cast<GLsizei>(params.GetMipWidth(level)), + static_cast<GLsizei>(params.GetMipHeight(level)), + internal_format, image_size, buffer); + break; + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glCompressedTextureSubImage3D(texture.handle, level, 0, 0, 0, + static_cast<GLsizei>(params.GetMipWidth(level)), + static_cast<GLsizei>(params.GetMipHeight(level)), + static_cast<GLsizei>(params.GetMipDepth(level)), + internal_format, image_size, buffer); + break; + case SurfaceTarget::TextureCubemap: { + const std::size_t layer_size{params.GetHostLayerSize(level)}; + for (std::size_t face = 0; face < params.depth; ++face) { + glCompressedTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), + static_cast<GLsizei>(params.GetMipWidth(level)), + static_cast<GLsizei>(params.GetMipHeight(level)), 1, + internal_format, static_cast<GLsizei>(layer_size), + buffer); + buffer += layer_size; + } + break; + } + default: + UNREACHABLE(); + } + } else { + switch (params.target) { + case SurfaceTarget::Texture1D: + glTextureSubImage1D(texture.handle, level, 0, params.GetMipWidth(level), format, type, + buffer); + break; + case SurfaceTarget::TextureBuffer: + ASSERT(level == 0); + glNamedBufferSubData(texture_buffer.handle, 0, + params.GetMipWidth(level) * params.GetBytesPerPixel(), buffer); + break; + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2D: + glTextureSubImage2D(texture.handle, level, 0, 0, params.GetMipWidth(level), + params.GetMipHeight(level), format, type, buffer); + break; + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glTextureSubImage3D( + texture.handle, level, 0, 0, 0, static_cast<GLsizei>(params.GetMipWidth(level)), + static_cast<GLsizei>(params.GetMipHeight(level)), + static_cast<GLsizei>(params.GetMipDepth(level)), format, type, buffer); + break; + case SurfaceTarget::TextureCubemap: + for (std::size_t face = 0; face < params.depth; ++face) { + glTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), + params.GetMipWidth(level), params.GetMipHeight(level), 1, + format, type, buffer); + buffer += params.GetHostLayerSize(level); + } + break; + default: + UNREACHABLE(); + } + } +} + +void CachedSurface::DecorateSurfaceName() { + LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); +} + +void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) { + LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix); +} + +View CachedSurface::CreateView(const ViewParams& view_key) { + return CreateViewInner(view_key, false); +} + +View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_proxy) { + auto view = std::make_shared<CachedSurfaceView>(*this, view_key, is_proxy); + views[view_key] = view; + if (!is_proxy) + view->DecorateViewName(gpu_addr, params.TargetName() + "V:" + std::to_string(view_count++)); + return view; +} + +CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, + const bool is_proxy) + : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { + target = GetTextureTarget(params.target); + if (!is_proxy) { + texture_view = CreateTextureView(); + } + swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A); +} + +CachedSurfaceView::~CachedSurfaceView() = default; + +void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { + ASSERT(params.num_layers == 1 && params.num_levels == 1); + + const auto& owner_params = surface.GetSurfaceParams(); + + switch (owner_params.target) { + case SurfaceTarget::Texture1D: + glFramebufferTexture1D(target, attachment, surface.GetTarget(), surface.GetTexture(), + params.base_level); + break; + case SurfaceTarget::Texture2D: + glFramebufferTexture2D(target, attachment, surface.GetTarget(), surface.GetTexture(), + params.base_level); + break; + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + glFramebufferTextureLayer(target, attachment, surface.GetTexture(), params.base_level, + params.base_layer); + break; + default: + UNIMPLEMENTED(); + } +} + +void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (new_swizzle == swizzle) + return; + swizzle = new_swizzle; + const std::array<GLint, 4> gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), + GetSwizzleSource(z_source), + GetSwizzleSource(w_source)}; + const GLuint handle = GetTexture(); + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); +} + +OGLTextureView CachedSurfaceView::CreateTextureView() const { + const auto& owner_params = surface.GetSurfaceParams(); + OGLTextureView texture_view; + texture_view.Create(); + + const GLuint handle{texture_view.handle}; + const FormatTuple& tuple{ + GetFormatTuple(owner_params.pixel_format, owner_params.component_type)}; + + glTextureView(handle, target, surface.texture.handle, tuple.internal_format, params.base_level, + params.num_levels, params.base_layer, params.num_layers); + + ApplyTextureDefaults(owner_params, handle); + + return texture_view; +} + +TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, + VideoCore::RasterizerInterface& rasterizer, + const Device& device) + : TextureCacheBase{system, rasterizer} { + src_framebuffer.Create(); + dst_framebuffer.Create(); +} + +TextureCacheOpenGL::~TextureCacheOpenGL() = default; + +Surface TextureCacheOpenGL::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { + return std::make_shared<CachedSurface>(gpu_addr, params); +} + +void TextureCacheOpenGL::ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) { + const auto& src_params = src_surface->GetSurfaceParams(); + const auto& dst_params = dst_surface->GetSurfaceParams(); + if (src_params.type != dst_params.type) { + // A fallback is needed + return; + } + const auto src_handle = src_surface->GetTexture(); + const auto src_target = src_surface->GetTarget(); + const auto dst_handle = dst_surface->GetTexture(); + const auto dst_target = dst_surface->GetTarget(); + glCopyImageSubData(src_handle, src_target, copy_params.source_level, copy_params.source_x, + copy_params.source_y, copy_params.source_z, dst_handle, dst_target, + copy_params.dest_level, copy_params.dest_x, copy_params.dest_y, + copy_params.dest_z, copy_params.width, copy_params.height, + copy_params.depth); +} + +void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) { + const auto& src_params{src_view->GetSurfaceParams()}; + const auto& dst_params{dst_view->GetSurfaceParams()}; + + OpenGLState prev_state{OpenGLState::GetCurState()}; + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); + + OpenGLState state; + state.draw.read_framebuffer = src_framebuffer.handle; + state.draw.draw_framebuffer = dst_framebuffer.handle; + state.AllDirty(); + state.Apply(); + + u32 buffers{}; + + UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + + if (src_params.type == SurfaceType::ColorTexture) { + src_view->Attach(GL_COLOR_ATTACHMENT0, GL_READ_FRAMEBUFFER); + glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + + dst_view->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + + buffers = GL_COLOR_BUFFER_BIT; + } else if (src_params.type == SurfaceType::Depth) { + glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + src_view->Attach(GL_DEPTH_ATTACHMENT, GL_READ_FRAMEBUFFER); + glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + dst_view->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + + buffers = GL_DEPTH_BUFFER_BIT; + } else if (src_params.type == SurfaceType::DepthStencil) { + glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + src_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_READ_FRAMEBUFFER); + + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + dst_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + + buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; + } + + const Common::Rectangle<u32>& src_rect = copy_config.src_rect; + const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; + const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; + + glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left, + dst_rect.top, dst_rect.right, dst_rect.bottom, buffers, + is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? GL_LINEAR : GL_NEAREST); +} + +void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) { + MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy); + const auto& src_params = src_surface->GetSurfaceParams(); + const auto& dst_params = dst_surface->GetSurfaceParams(); + UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1); + + const auto source_format = GetFormatTuple(src_params.pixel_format, src_params.component_type); + const auto dest_format = GetFormatTuple(dst_params.pixel_format, dst_params.component_type); + + const std::size_t source_size = src_surface->GetHostSizeInBytes(); + const std::size_t dest_size = dst_surface->GetHostSizeInBytes(); + + const std::size_t buffer_size = std::max(source_size, dest_size); + + GLuint copy_pbo_handle = FetchPBO(buffer_size); + + glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle); + + if (source_format.compressed) { + glGetCompressedTextureImage(src_surface->GetTexture(), 0, static_cast<GLsizei>(source_size), + nullptr); + } else { + glGetTextureImage(src_surface->GetTexture(), 0, source_format.format, source_format.type, + static_cast<GLsizei>(source_size), nullptr); + } + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo_handle); + + const GLsizei width = static_cast<GLsizei>(dst_params.width); + const GLsizei height = static_cast<GLsizei>(dst_params.height); + const GLsizei depth = static_cast<GLsizei>(dst_params.depth); + if (dest_format.compressed) { + LOG_CRITICAL(HW_GPU, "Compressed buffer copy is unimplemented!"); + UNREACHABLE(); + } else { + switch (dst_params.target) { + case SurfaceTarget::Texture1D: + glTextureSubImage1D(dst_surface->GetTexture(), 0, 0, width, dest_format.format, + dest_format.type, nullptr); + break; + case SurfaceTarget::Texture2D: + glTextureSubImage2D(dst_surface->GetTexture(), 0, 0, 0, width, height, + dest_format.format, dest_format.type, nullptr); + break; + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth, + dest_format.format, dest_format.type, nullptr); + break; + case SurfaceTarget::TextureCubemap: + glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth, + dest_format.format, dest_format.type, nullptr); + break; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", + static_cast<u32>(dst_params.target)); + UNREACHABLE(); + } + } + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glTextureBarrier(); +} + +GLuint TextureCacheOpenGL::FetchPBO(std::size_t buffer_size) { + ASSERT_OR_EXECUTE(buffer_size > 0, { return 0; }); + const u32 l2 = Common::Log2Ceil64(static_cast<u64>(buffer_size)); + OGLBuffer& cp = copy_pbo_cache[l2]; + if (cp.handle == 0) { + const std::size_t ceil_size = 1ULL << l2; + cp.Create(); + cp.MakeStreamCopy(ceil_size); + } + return cp.handle; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h new file mode 100644 index 000000000..8e13ab38b --- /dev/null +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -0,0 +1,147 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <functional> +#include <memory> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "video_core/engines/shader_bytecode.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/texture_cache/texture_cache.h" + +namespace OpenGL { + +using VideoCommon::SurfaceParams; +using VideoCommon::ViewParams; + +class CachedSurfaceView; +class CachedSurface; +class TextureCacheOpenGL; + +using Surface = std::shared_ptr<CachedSurface>; +using View = std::shared_ptr<CachedSurfaceView>; +using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; + +class CachedSurface final : public VideoCommon::SurfaceBase<View> { + friend CachedSurfaceView; + +public: + explicit CachedSurface(GPUVAddr gpu_addr, const SurfaceParams& params); + ~CachedSurface(); + + void UploadTexture(const std::vector<u8>& staging_buffer) override; + void DownloadTexture(std::vector<u8>& staging_buffer) override; + + GLenum GetTarget() const { + return target; + } + + GLuint GetTexture() const { + return texture.handle; + } + +protected: + void DecorateSurfaceName() override; + + View CreateView(const ViewParams& view_key) override; + View CreateViewInner(const ViewParams& view_key, bool is_proxy); + +private: + void UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer); + + GLenum internal_format{}; + GLenum format{}; + GLenum type{}; + bool is_compressed{}; + GLenum target{}; + u32 view_count{}; + + OGLTexture texture; + OGLBuffer texture_buffer; +}; + +class CachedSurfaceView final : public VideoCommon::ViewBase { +public: + explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); + ~CachedSurfaceView(); + + /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER + void Attach(GLenum attachment, GLenum target) const; + + void ApplySwizzle(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + void DecorateViewName(GPUVAddr gpu_addr, std::string prefix); + + void MarkAsModified(u64 tick) { + surface.MarkAsModified(true, tick); + } + + GLuint GetTexture() const { + if (is_proxy) { + return surface.GetTexture(); + } + return texture_view.handle; + } + + const SurfaceParams& GetSurfaceParams() const { + return surface.GetSurfaceParams(); + } + +private: + u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source) const { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); + } + + OGLTextureView CreateTextureView() const; + + CachedSurface& surface; + GLenum target{}; + + OGLTextureView texture_view; + u32 swizzle{}; + bool is_proxy{}; +}; + +class TextureCacheOpenGL final : public TextureCacheBase { +public: + explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const Device& device); + ~TextureCacheOpenGL(); + +protected: + Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override; + + void ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) override; + + void ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) override; + + void BufferCopy(Surface& src_surface, Surface& dst_surface) override; + +private: + GLuint FetchPBO(std::size_t buffer_size); + + OGLFramebuffer src_framebuffer; + OGLFramebuffer dst_framebuffer; + std::unordered_map<u32, OGLBuffer> copy_pbo_cache; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index aafd6f31b..839178152 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -101,21 +101,19 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst RendererOpenGL::~RendererOpenGL() = default; -/// Swap buffers (render frame) -void RendererOpenGL::SwapBuffers( - std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { - +void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { system.GetPerfStats().EndSystemFrame(); // Maintain the rasterizer's state as a priority OpenGLState prev_state = OpenGLState::GetCurState(); + state.AllDirty(); state.Apply(); if (framebuffer) { // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != (GLsizei)framebuffer->get().width || - screen_info.texture.height != (GLsizei)framebuffer->get().height || - screen_info.texture.pixel_format != framebuffer->get().pixel_format) { + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format) { // Reallocate texture if the framebuffer size has changed. // This is expected to not happen very often and hence should not be a // performance problem. @@ -130,6 +128,8 @@ void RendererOpenGL::SwapBuffers( DrawScreen(render_window.GetFramebufferLayout()); + rasterizer->TickFrame(); + render_window.SwapBuffers(); } @@ -139,6 +139,7 @@ void RendererOpenGL::SwapBuffers( system.GetPerfStats().BeginSystemFrame(); // Restore the rasterizer state + prev_state.AllDirty(); prev_state.Apply(); } @@ -146,43 +147,43 @@ void RendererOpenGL::SwapBuffers( * Loads framebuffer from emulated memory into the active OpenGL texture. */ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { - const u32 bytes_per_pixel{Tegra::FramebufferConfig::BytesPerPixel(framebuffer.pixel_format)}; - const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; - // Framebuffer orientation handling framebuffer_transform_flags = framebuffer.transform_flags; framebuffer_crop_rect = framebuffer.crop_rect; - // Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default - // only allows rows to have a memory alignement of 4. - ASSERT(framebuffer.stride % 4 == 0); - - if (!rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { - // Reset the screen info's display texture to its own permanent texture - screen_info.display_texture = screen_info.texture.resource.handle; - - rasterizer->FlushRegion(ToCacheAddr(Memory::GetPointer(framebuffer_addr)), size_in_bytes); - - constexpr u32 linear_bpp = 4; - VideoCore::MortonCopyPixels128(VideoCore::MortonSwizzleMode::MortonToLinear, - framebuffer.width, framebuffer.height, bytes_per_pixel, - linear_bpp, Memory::GetPointer(framebuffer_addr), - gl_framebuffer_data.data()); - - glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride)); + const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; + if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { + return; + } - // Update existing texture - // TODO: Test what happens on hardware when you change the framebuffer dimensions so that - // they differ from the LCD resolution. - // TODO: Applications could theoretically crash yuzu here by specifying too large - // framebuffer sizes. We should make sure that this cannot happen. - glTextureSubImage2D(screen_info.texture.resource.handle, 0, 0, 0, framebuffer.width, - framebuffer.height, screen_info.texture.gl_format, - screen_info.texture.gl_type, gl_framebuffer_data.data()); + // Reset the screen info's display texture to its own permanent texture + screen_info.display_texture = screen_info.texture.resource.handle; - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); - } + const auto pixel_format{ + VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; + const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; + const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; + const auto host_ptr{Memory::GetPointer(framebuffer_addr)}; + rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); + + // TODO(Rodrigo): Read this from HLE + constexpr u32 block_height_log2 = 4; + VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format, + framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1, + gl_framebuffer_data.data(), host_ptr); + + glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride)); + + // Update existing texture + // TODO: Test what happens on hardware when you change the framebuffer dimensions so that + // they differ from the LCD resolution. + // TODO: Applications could theoretically crash yuzu here by specifying too large + // framebuffer sizes. We should make sure that this cannot happen. + glTextureSubImage2D(screen_info.texture.resource.handle, 0, 0, 0, framebuffer.width, + framebuffer.height, screen_info.texture.gl_format, + screen_info.texture.gl_type, gl_framebuffer_data.data()); + + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); } /** @@ -205,6 +206,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Link shaders and get variable locations shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); state.draw.shader_program = shader.handle; + state.AllDirty(); state.Apply(); uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix"); uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture"); @@ -262,7 +264,6 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - // Initialize sRGB Usage OpenGLState::ClearsRGBUsed(); rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); } @@ -273,22 +274,29 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, texture.height = framebuffer.height; texture.pixel_format = framebuffer.pixel_format; + const auto pixel_format{ + VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; + const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; + gl_framebuffer_data.resize(texture.width * texture.height * bytes_per_pixel); + GLint internal_format; switch (framebuffer.pixel_format) { case Tegra::FramebufferConfig::PixelFormat::ABGR8: internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; - gl_framebuffer_data.resize(texture.width * texture.height * 4); + break; + case Tegra::FramebufferConfig::PixelFormat::RGB565: + internal_format = GL_RGB565; + texture.gl_format = GL_RGB; + texture.gl_type = GL_UNSIGNED_SHORT_5_6_5; break; default: internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; - gl_framebuffer_data.resize(texture.width * texture.height * 4); - LOG_CRITICAL(Render_OpenGL, "Unknown framebuffer pixel format: {}", - static_cast<u32>(framebuffer.pixel_format)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", + static_cast<u32>(framebuffer.pixel_format)); } texture.resource.Release(); @@ -334,16 +342,18 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v), }}; - state.texture_units[0].texture = screen_info.display_texture; + state.textures[0] = screen_info.display_texture; // Workaround brigthness problems in SMO by enabling sRGB in the final output // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987 state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); + state.AllDirty(); state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data()); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); // Restore default state state.framebuffer_srgb.enabled = false; - state.texture_units[0].texture = 0; + state.textures[0] = 0; + state.AllDirty(); state.Apply(); // Clear sRGB state for the next frame OpenGLState::ClearsRGBUsed(); @@ -388,6 +398,7 @@ void RendererOpenGL::CaptureScreenshot() { GLuint old_read_fb = state.draw.read_framebuffer; GLuint old_draw_fb = state.draw.draw_framebuffer; state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; + state.AllDirty(); state.Apply(); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -407,6 +418,7 @@ void RendererOpenGL::CaptureScreenshot() { screenshot_framebuffer.Release(); state.draw.read_framebuffer = old_read_fb; state.draw.draw_framebuffer = old_draw_fb; + state.AllDirty(); state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); @@ -471,7 +483,6 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum } } -/// Initialize the renderer bool RendererOpenGL::Init() { Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window}; diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 4aebf2321..9bd086368 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -43,14 +43,13 @@ struct ScreenInfo { TextureInfo texture; }; -class RendererOpenGL : public VideoCore::RendererBase { +class RendererOpenGL final : public VideoCore::RendererBase { public: explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system); ~RendererOpenGL() override; /// Swap buffers (render frame) - void SwapBuffers( - std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; /// Initialize the renderer bool Init() override; diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index f23fc9f9d..c504a2c1a 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -5,35 +5,75 @@ #include <string> #include <fmt/format.h> #include <glad/glad.h> + #include "common/assert.h" #include "common/common_types.h" +#include "common/scope_exit.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { +VertexArrayPushBuffer::VertexArrayPushBuffer() = default; + +VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; + +void VertexArrayPushBuffer::Setup(GLuint vao_) { + vao = vao_; + index_buffer = nullptr; + vertex_buffers.clear(); +} + +void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { + index_buffer = buffer; +} + +void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, + GLintptr offset, GLsizei stride) { + vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); +} + +void VertexArrayPushBuffer::Bind() { + if (index_buffer) { + glVertexArrayElementBuffer(vao, *index_buffer); + } + + // TODO(Rodrigo): Find a way to ARB_multi_bind this + for (const auto& entry : vertex_buffers) { + glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset, + entry.stride); + } +} + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; void BindBuffersRangePushBuffer::Setup(GLuint first_) { first = first_; - buffers.clear(); + buffer_pointers.clear(); offsets.clear(); sizes.clear(); } -void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) { - buffers.push_back(buffer); +void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) { + buffer_pointers.push_back(buffer); offsets.push_back(offset); sizes.push_back(size); } -void BindBuffersRangePushBuffer::Bind() const { - const std::size_t count{buffers.size()}; +void BindBuffersRangePushBuffer::Bind() { + // Ensure sizes are valid. + const std::size_t count{buffer_pointers.size()}; DEBUG_ASSERT(count == offsets.size() && count == sizes.size()); if (count == 0) { return; } + + // Dereference buffers. + buffers.resize(count); + std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(), + [](const GLuint* pointer) { return *pointer; }); + glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(), sizes.data()); } @@ -63,4 +103,4 @@ void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_vie glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str())); } -} // namespace OpenGL
\ No newline at end of file +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index b3e9fc499..6c2b45546 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,20 +11,49 @@ namespace OpenGL { -class BindBuffersRangePushBuffer { +class VertexArrayPushBuffer final { public: - BindBuffersRangePushBuffer(GLenum target); + explicit VertexArrayPushBuffer(); + ~VertexArrayPushBuffer(); + + void Setup(GLuint vao_); + + void SetIndexBuffer(const GLuint* buffer); + + void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset, + GLsizei stride); + + void Bind(); + +private: + struct Entry { + GLuint binding_index{}; + const GLuint* buffer{}; + GLintptr offset{}; + GLsizei stride{}; + }; + + GLuint vao{}; + const GLuint* index_buffer{}; + std::vector<Entry> vertex_buffers; +}; + +class BindBuffersRangePushBuffer final { +public: + explicit BindBuffersRangePushBuffer(GLenum target); ~BindBuffersRangePushBuffer(); void Setup(GLuint first_); - void Push(GLuint buffer, GLintptr offset, GLsizeiptr size); + void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size); - void Bind() const; + void Bind(); private: - GLenum target; - GLuint first; + GLenum target{}; + GLuint first{}; + std::vector<const GLuint*> buffer_pointers; + std::vector<GLuint> buffers; std::vector<GLintptr> offsets; std::vector<GLsizeiptr> sizes; @@ -32,4 +61,4 @@ private: void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {}); -} // namespace OpenGL
\ No newline at end of file +} // namespace OpenGL |
