diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
| -rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.h | 9 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 88 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 9 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer_cache.cpp | 394 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer_cache.h | 110 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 34 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.h | 57 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 461 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_gen.cpp | 84 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_gen.h | 6 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_manager.cpp | 8 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_manager.h | 7 |
12 files changed, 945 insertions, 322 deletions
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 965976334..be29dc8be 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -15,15 +15,18 @@ namespace OpenGL { -struct CachedBufferEntry final { - VAddr GetAddr() const { +struct CachedBufferEntry final : public RasterizerCacheObject { + VAddr GetAddr() const override { return addr; } - std::size_t GetSizeInBytes() const { + std::size_t GetSizeInBytes() const override { return size; } + // We do not have to flush this cache as things in it are never modified by us. + void Flush() override {} + VAddr addr; std::size_t size; GLintptr offset; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 209bdf181..468253033 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -255,7 +255,7 @@ DrawParameters RasterizerOpenGL::SetupDraw() { return params; } -void RasterizerOpenGL::SetupShaders() { +void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); @@ -270,6 +270,11 @@ void RasterizerOpenGL::SetupShaders() { // Skip stages that are not enabled if (!gpu.regs.IsShaderConfigEnabled(index)) { + switch (program) { + case Maxwell::ShaderProgram::Geometry: + shader_program_manager->UseTrivialGeometryShader(); + break; + } continue; } @@ -281,18 +286,26 @@ void RasterizerOpenGL::SetupShaders() { &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); // Bind the buffer - glBindBufferRange(GL_UNIFORM_BUFFER, stage, buffer_cache.GetHandle(), offset, sizeof(ubo)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(stage), buffer_cache.GetHandle(), + offset, static_cast<GLsizeiptr>(sizeof(ubo))); Shader shader{shader_cache.GetStageProgram(program)}; switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: { - shader_program_manager->UseProgrammableVertexShader(shader->GetProgramHandle()); + shader_program_manager->UseProgrammableVertexShader( + shader->GetProgramHandle(primitive_mode)); + break; + } + case Maxwell::ShaderProgram::Geometry: { + shader_program_manager->UseProgrammableGeometryShader( + shader->GetProgramHandle(primitive_mode)); break; } case Maxwell::ShaderProgram::Fragment: { - shader_program_manager->UseProgrammableFragmentShader(shader->GetProgramHandle()); + shader_program_manager->UseProgrammableFragmentShader( + shader->GetProgramHandle(primitive_mode)); break; } default: @@ -302,12 +315,13 @@ void RasterizerOpenGL::SetupShaders() { } // Configure the const buffers for this shader stage. - current_constbuffer_bindpoint = SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), - shader, current_constbuffer_bindpoint); + current_constbuffer_bindpoint = + SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), shader, primitive_mode, + current_constbuffer_bindpoint); // Configure the textures for this shader stage. current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader, - current_texture_bindpoint); + primitive_mode, current_texture_bindpoint); // When VertexA is enabled, we have dual vertex shaders if (program == Maxwell::ShaderProgram::VertexA) { @@ -317,8 +331,6 @@ void RasterizerOpenGL::SetupShaders() { } state.Apply(); - - shader_program_manager->UseTrivialGeometryShader(); } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -412,6 +424,13 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep // Used when just a single color attachment is enabled, e.g. for clearing a color buffer Surface color_surface = res_cache.GetColorBufferSurface(*single_color_target, preserve_contents); + + if (color_surface) { + // Assume that a surface will be written to if it is used as a framebuffer, even if + // the shader doesn't actually write to it. + color_surface->MarkAsModified(true, res_cache); + } + glFramebufferTexture2D( GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target), GL_TEXTURE_2D, @@ -422,6 +441,13 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep std::array<GLenum, Maxwell::NumRenderTargets> buffers; for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents); + + if (color_surface) { + // Assume that a surface will be written to if it is used as a framebuffer, even + // if the shader doesn't actually write to it. + color_surface->MarkAsModified(true, res_cache); + } + buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); glFramebufferTexture2D( GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), @@ -441,6 +467,10 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep } if (depth_surface) { + // Assume that a surface will be written to if it is used as a framebuffer, even if + // the shader doesn't actually write to it. + depth_surface->MarkAsModified(true, res_cache); + if (regs.stencil_enable) { // Attach both depth and stencil glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, @@ -541,6 +571,7 @@ void RasterizerOpenGL::DrawArrays() { SyncLogicOpState(); SyncCullMode(); SyncAlphaTest(); + SyncScissorTest(); SyncTransformFeedback(); SyncPointState(); @@ -580,7 +611,7 @@ void RasterizerOpenGL::DrawArrays() { SetupVertexArrays(); DrawParameters params = SetupDraw(); - SetupShaders(); + SetupShaders(params.primitive_mode); buffer_cache.Unmap(); @@ -604,7 +635,14 @@ void RasterizerOpenGL::DrawArrays() { void RasterizerOpenGL::FlushAll() {} -void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {} +void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + + if (Settings::values.use_accurate_gpu_emulation) { + // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit + res_cache.FlushRegion(addr, size); + } +} void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); @@ -614,6 +652,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { } void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { + FlushRegion(addr, size); InvalidateRegion(addr, size); } @@ -719,7 +758,7 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr } u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader, - u32 current_bindpoint) { + GLenum primitive_mode, u32 current_bindpoint) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& gpu = Core::System::GetInstance().GPU(); const auto& maxwell3d = gpu.Maxwell3D(); @@ -771,7 +810,7 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shad buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); // Now configure the bindpoint of the buffer inside the shader - glUniformBlockBinding(shader->GetProgramHandle(), + glUniformBlockBinding(shader->GetProgramHandle(primitive_mode), shader->GetProgramResourceIndex(used_buffer), current_bindpoint + bindpoint); @@ -787,7 +826,8 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shad return current_bindpoint + static_cast<u32>(entries.size()); } -u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, u32 current_unit) { +u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, + GLenum primitive_mode, u32 current_unit) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& gpu = Core::System::GetInstance().GPU(); const auto& maxwell3d = gpu.Maxwell3D(); @@ -802,8 +842,8 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, // Bind the uniform to the sampler. - glProgramUniform1i(shader->GetProgramHandle(), shader->GetUniformLocation(entry), - current_bindpoint); + glProgramUniform1i(shader->GetProgramHandle(primitive_mode), + shader->GetUniformLocation(entry), current_bindpoint); const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset()); @@ -972,6 +1012,22 @@ void RasterizerOpenGL::SyncAlphaTest() { } } +void RasterizerOpenGL::SyncScissorTest() { + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + + state.scissor.enabled = (regs.scissor_test.enable != 0); + // TODO(Blinkhawk): Figure if the hardware supports scissor testing per viewport and how it's + // implemented. + if (regs.scissor_test.enable != 0) { + const u32 width = regs.scissor_test.max_x - regs.scissor_test.min_x; + const u32 height = regs.scissor_test.max_y - regs.scissor_test.min_y; + state.scissor.x = regs.scissor_test.min_x; + state.scissor.y = regs.scissor_test.min_y; + state.scissor.width = width; + state.scissor.height = height; + } +} + void RasterizerOpenGL::SyncTransformFeedback() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0dab2018b..b1f7ccc7e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -120,7 +120,7 @@ private: * @returns The next available bindpoint for use in the next shader stage. */ u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader, - u32 current_bindpoint); + GLenum primitive_mode, u32 current_bindpoint); /* * Configures the current textures to use for the draw command. @@ -130,7 +130,7 @@ private: * @returns The next available bindpoint for use in the next shader stage. */ u32 SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader, - u32 current_unit); + GLenum primitive_mode, u32 current_unit); /// Syncs the viewport to match the guest state void SyncViewport(); @@ -165,6 +165,9 @@ private: /// Syncs the alpha test state to match the guest state void SyncAlphaTest(); + /// Syncs the scissor test state to match the guest state + void SyncScissorTest(); + /// Syncs the transform feedback state to match the guest state void SyncTransformFeedback(); @@ -207,7 +210,7 @@ private: DrawParameters SetupDraw(); - void SetupShaders(); + void SetupShaders(GLenum primitive_mode); enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 56ff83eff..1cb77aaf2 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -34,18 +34,57 @@ struct FormatTuple { bool compressed; }; -static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { - auto& gpu{Core::System::GetInstance().GPU()}; - const auto cpu_addr{gpu.MemoryManager().GpuToCpuAddress(gpu_addr)}; - return cpu_addr ? *cpu_addr : 0; +static bool IsPixelFormatASTC(PixelFormat format) { + switch (format) { + case PixelFormat::ASTC_2D_4X4: + case PixelFormat::ASTC_2D_5X4: + case PixelFormat::ASTC_2D_8X8: + case PixelFormat::ASTC_2D_8X5: + return true; + default: + return false; + } +} + +static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { + switch (format) { + case PixelFormat::ASTC_2D_4X4: + return {4, 4}; + case PixelFormat::ASTC_2D_5X4: + return {5, 4}; + case PixelFormat::ASTC_2D_8X8: + return {8, 8}; + case PixelFormat::ASTC_2D_8X5: + return {8, 5}; + default: + LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format)); + UNREACHABLE(); + } +} + +void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) { + auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; + const auto cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr_)}; + + addr = cpu_addr ? *cpu_addr : 0; + gpu_addr = gpu_addr_; + size_in_bytes = SizeInBytesRaw(); + + if (IsPixelFormatASTC(pixel_format)) { + // ASTC is uncompressed in software, in emulated as RGBA8 + size_in_bytes_gl = width * height * depth * 4; + } else { + size_in_bytes_gl = SizeInBytesGL(); + } } /*static*/ SurfaceParams SurfaceParams::CreateForTexture( const Tegra::Texture::FullTextureInfo& config, const GLShader::SamplerEntry& entry) { SurfaceParams params{}; - params.addr = TryGetCpuAddr(config.tic.Address()); params.is_tiled = config.tic.IsTiled(); + params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0, params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0, + params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0, params.pixel_format = PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value()); params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value()); @@ -85,20 +124,23 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { break; } - params.size_in_bytes_total = params.SizeInBytesTotal(); - params.size_in_bytes_2d = params.SizeInBytes2D(); params.max_mip_level = config.tic.max_mip_level + 1; params.rt = {}; + params.InitCacheParameters(config.tic.Address()); + return params; } /*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(std::size_t index) { const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]}; SurfaceParams params{}; - params.addr = TryGetCpuAddr(config.Address()); - params.is_tiled = true; - params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight; + + params.is_tiled = + config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; + params.block_width = 1 << config.memory_layout.block_width; + params.block_height = 1 << config.memory_layout.block_height; + params.block_depth = 1 << config.memory_layout.block_depth; params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); params.component_type = ComponentTypeFromRenderTarget(config.format); params.type = GetFormatType(params.pixel_format); @@ -107,8 +149,6 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.size_in_bytes_total = params.SizeInBytesTotal(); - params.size_in_bytes_2d = params.SizeInBytes2D(); params.max_mip_level = 0; // Render target specific parameters, not used for caching @@ -117,16 +157,21 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.rt.layer_stride = config.layer_stride; params.rt.base_layer = config.base_layer; + params.InitCacheParameters(config.Address()); + return params; } -/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(u32 zeta_width, u32 zeta_height, - Tegra::GPUVAddr zeta_address, - Tegra::DepthFormat format) { +/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer( + u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format, + u32 block_width, u32 block_height, u32 block_depth, + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) { SurfaceParams params{}; - params.addr = TryGetCpuAddr(zeta_address); - params.is_tiled = true; - params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight; + + params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; + params.block_width = 1 << std::min(block_width, 5U); + params.block_height = 1 << std::min(block_height, 5U); + params.block_depth = 1 << std::min(block_depth, 5U); params.pixel_format = PixelFormatFromDepthFormat(format); params.component_type = ComponentTypeFromDepthFormat(format); params.type = GetFormatType(params.pixel_format); @@ -135,20 +180,22 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.unaligned_height = zeta_height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.size_in_bytes_total = params.SizeInBytesTotal(); - params.size_in_bytes_2d = params.SizeInBytes2D(); params.max_mip_level = 0; params.rt = {}; + params.InitCacheParameters(zeta_address); + return params; } /*static*/ SurfaceParams SurfaceParams::CreateForFermiCopySurface( const Tegra::Engines::Fermi2D::Regs::Surface& config) { SurfaceParams params{}; - params.addr = TryGetCpuAddr(config.Address()); + params.is_tiled = !config.linear; - params.block_height = params.is_tiled ? config.BlockHeight() : 0, + params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0, + params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0, + params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 32U) : 0, params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); params.component_type = ComponentTypeFromRenderTarget(config.format); params.type = GetFormatType(params.pixel_format); @@ -157,11 +204,11 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.size_in_bytes_total = params.SizeInBytesTotal(); - params.size_in_bytes_2d = params.SizeInBytes2D(); params.max_mip_level = 0; params.rt = {}; + params.InitCacheParameters(config.Address()); + return params; } @@ -221,6 +268,8 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RG32UI {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // R32UI {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4 // Depth formats {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F @@ -264,28 +313,6 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType return format; } -static bool IsPixelFormatASTC(PixelFormat format) { - switch (format) { - case PixelFormat::ASTC_2D_4X4: - case PixelFormat::ASTC_2D_8X8: - return true; - default: - return false; - } -} - -static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { - switch (format) { - case PixelFormat::ASTC_2D_4X4: - return {4, 4}; - case PixelFormat::ASTC_2D_8X8: - return {8, 8}; - default: - LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format)); - UNREACHABLE(); - } -} - MathUtil::Rectangle<u32> SurfaceParams::GetRect() const { u32 actual_height{unaligned_height}; if (IsPixelFormatASTC(pixel_format)) { @@ -313,29 +340,27 @@ static bool IsFormatBCn(PixelFormat format) { } template <bool morton_to_gl, PixelFormat format> -void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, std::size_t gl_buffer_size, - VAddr addr) { - constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT; - constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format); +void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth, u8* gl_buffer, + std::size_t gl_buffer_size, VAddr addr) { + constexpr u32 bytes_per_pixel = SurfaceParams::GetBytesPerPixel(format); + + // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual + // pixel values. + const u32 tile_size{IsFormatBCn(format) ? 4U : 1U}; if (morton_to_gl) { - // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual - // pixel values. - const u32 tile_size{IsFormatBCn(format) ? 4U : 1U}; const std::vector<u8> data = Tegra::Texture::UnswizzleTexture( - addr, tile_size, bytes_per_pixel, stride, height, block_height); + addr, tile_size, bytes_per_pixel, stride, height, depth, block_height, block_depth); const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())}; memcpy(gl_buffer, data.data(), size_to_copy); } else { - // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should - // check the configuration for this and perform more generic un/swizzle - LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!"); - VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel, - Memory::GetPointer(addr), gl_buffer, morton_to_gl); + Tegra::Texture::CopySwizzledData(stride / tile_size, height / tile_size, depth, + bytes_per_pixel, bytes_per_pixel, Memory::GetPointer(addr), + gl_buffer, false, block_height, block_depth); } } -static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), +static constexpr std::array<void (*)(u32, u32, u32, u32, u32, u8*, std::size_t, VAddr), SurfaceParams::MaxPixelFormat> morton_to_gl_fns = { // clang-format off @@ -385,6 +410,8 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), MortonCopy<true, PixelFormat::RG32UI>, MortonCopy<true, PixelFormat::R32UI>, MortonCopy<true, PixelFormat::ASTC_2D_8X8>, + MortonCopy<true, PixelFormat::ASTC_2D_8X5>, + MortonCopy<true, PixelFormat::ASTC_2D_5X4>, MortonCopy<true, PixelFormat::Z32F>, MortonCopy<true, PixelFormat::Z16>, MortonCopy<true, PixelFormat::Z24S8>, @@ -393,7 +420,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), // clang-format on }; -static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), +static constexpr std::array<void (*)(u32, u32, u32, u32, u32, u8*, std::size_t, VAddr), SurfaceParams::MaxPixelFormat> gl_to_morton_fns = { // clang-format off @@ -410,17 +437,16 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), MortonCopy<false, PixelFormat::RGBA16UI>, MortonCopy<false, PixelFormat::R11FG11FB10F>, MortonCopy<false, PixelFormat::RGBA32UI>, - // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/DXN2/BC7U/BC6H_UF16/BC6H_SF16/ASTC_2D_4X4 - // formats are not supported - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, + MortonCopy<false, PixelFormat::DXT1>, + MortonCopy<false, PixelFormat::DXT23>, + MortonCopy<false, PixelFormat::DXT45>, + MortonCopy<false, PixelFormat::DXN1>, + MortonCopy<false, PixelFormat::DXN2UNORM>, + MortonCopy<false, PixelFormat::DXN2SNORM>, + MortonCopy<false, PixelFormat::BC7U>, + MortonCopy<false, PixelFormat::BC6H_UF16>, + MortonCopy<false, PixelFormat::BC6H_SF16>, + // TODO(Subv): Swizzling ASTC formats are not supported nullptr, MortonCopy<false, PixelFormat::G8R8U>, MortonCopy<false, PixelFormat::G8R8S>, @@ -445,6 +471,8 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), MortonCopy<false, PixelFormat::RG32UI>, MortonCopy<false, PixelFormat::R32UI>, nullptr, + nullptr, + nullptr, MortonCopy<false, PixelFormat::Z32F>, MortonCopy<false, PixelFormat::Z16>, MortonCopy<false, PixelFormat::Z24S8>, @@ -604,22 +632,21 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface, auto source_format = GetFormatTuple(src_params.pixel_format, src_params.component_type); auto dest_format = GetFormatTuple(dst_params.pixel_format, dst_params.component_type); - std::size_t buffer_size = - std::max(src_params.size_in_bytes_total, dst_params.size_in_bytes_total); + std::size_t buffer_size = std::max(src_params.size_in_bytes, dst_params.size_in_bytes); glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle); glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW_ARB); if (source_format.compressed) { glGetCompressedTextureImage(src_surface->Texture().handle, src_attachment, - static_cast<GLsizei>(src_params.size_in_bytes_total), nullptr); + static_cast<GLsizei>(src_params.size_in_bytes), nullptr); } else { glGetTextureImage(src_surface->Texture().handle, src_attachment, source_format.format, - source_format.type, static_cast<GLsizei>(src_params.size_in_bytes_total), + source_format.type, static_cast<GLsizei>(src_params.size_in_bytes), nullptr); } // If the new texture is bigger than the previous one, we need to fill in the rest with data // from the CPU. - if (src_params.size_in_bytes_total < dst_params.size_in_bytes_total) { + if (src_params.size_in_bytes < dst_params.size_in_bytes) { // Upload the rest of the memory. if (dst_params.is_tiled) { // TODO(Subv): We might have to de-tile the subtexture and re-tile it with the rest @@ -629,12 +656,12 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface, LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during " "reinterpretation but the texture is tiled."); } - std::size_t remaining_size = - dst_params.size_in_bytes_total - src_params.size_in_bytes_total; + std::size_t remaining_size = dst_params.size_in_bytes - src_params.size_in_bytes; std::vector<u8> data(remaining_size); - Memory::ReadBlock(dst_params.addr + src_params.size_in_bytes_total, data.data(), - data.size()); - glBufferSubData(GL_PIXEL_PACK_BUFFER, src_params.size_in_bytes_total, remaining_size, + std::memcpy(data.data(), Memory::GetPointer(dst_params.addr + src_params.size_in_bytes), + data.size()); + + glBufferSubData(GL_PIXEL_PACK_BUFFER, src_params.size_in_bytes, remaining_size, data.data()); } @@ -680,7 +707,8 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface, } CachedSurface::CachedSurface(const SurfaceParams& params) - : params(params), gl_target(SurfaceTargetToGL(params.target)) { + : params(params), gl_target(SurfaceTargetToGL(params.target)), + cached_size_in_bytes(params.size_in_bytes) { texture.Create(); const auto& rect{params.GetRect()}; @@ -730,9 +758,21 @@ CachedSurface::CachedSurface(const SurfaceParams& params) VideoCore::LabelGLObject(GL_TEXTURE, texture.handle, params.addr, SurfaceParams::SurfaceTargetName(params.target)); + + // Clamp size to mapped GPU memory region + // TODO(bunnei): Super Mario Odyssey maps a 0x40000 byte region and then uses it for a 0x80000 + // R32F render buffer. We do not yet know if this is a game bug or something else, but this + // check is necessary to prevent flushing from overwriting unmapped memory. + + auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; + const u64 max_size{memory_manager.GetRegionEnd(params.gpu_addr) - params.gpu_addr}; + if (cached_size_in_bytes > max_size) { + LOG_ERROR(HW_GPU, "Surface size {} exceeds region size {}", params.size_in_bytes, max_size); + cached_size_in_bytes = max_size; + } } -static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) { +static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bool reverse) { union S8Z24 { BitField<0, 24, u32> z24; BitField<24, 8, u32> s8; @@ -745,22 +785,29 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) { }; static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size"); - S8Z24 input_pixel{}; - Z24S8 output_pixel{}; - constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)}; + S8Z24 s8z24_pixel{}; + Z24S8 z24s8_pixel{}; + constexpr auto bpp{SurfaceParams::GetBytesPerPixel(PixelFormat::S8Z24)}; for (std::size_t y = 0; y < height; ++y) { for (std::size_t x = 0; x < width; ++x) { const std::size_t offset{bpp * (y * width + x)}; - std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24)); - output_pixel.s8.Assign(input_pixel.s8); - output_pixel.z24.Assign(input_pixel.z24); - std::memcpy(&data[offset], &output_pixel, sizeof(Z24S8)); + if (reverse) { + std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8)); + s8z24_pixel.s8.Assign(z24s8_pixel.s8); + s8z24_pixel.z24.Assign(z24s8_pixel.z24); + std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24)); + } else { + std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24)); + z24s8_pixel.s8.Assign(s8z24_pixel.s8); + z24s8_pixel.z24.Assign(s8z24_pixel.z24); + std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8)); + } } } } static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) { - constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)}; + constexpr auto bpp{SurfaceParams::GetBytesPerPixel(PixelFormat::G8R8U)}; for (std::size_t y = 0; y < height; ++y) { for (std::size_t x = 0; x < width; ++x) { const std::size_t offset{bpp * (y * width + x)}; @@ -780,7 +827,9 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma u32 width, u32 height) { switch (pixel_format) { case PixelFormat::ASTC_2D_4X4: - case PixelFormat::ASTC_2D_8X8: { + case PixelFormat::ASTC_2D_8X8: + case PixelFormat::ASTC_2D_8X5: + case PixelFormat::ASTC_2D_5X4: { // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC. u32 block_width{}; u32 block_height{}; @@ -790,7 +839,7 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma } case PixelFormat::S8Z24: // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24. - ConvertS8Z24ToZ24S8(data, width, height); + ConvertS8Z24ToZ24S8(data, width, height, false); break; case PixelFormat::G8R8U: @@ -801,49 +850,54 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma } } +/** + * Helper function to perform software conversion (as needed) when flushing a buffer from OpenGL to + * Switch memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or + * with typical desktop GPUs. + */ +static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelFormat pixel_format, + u32 width, u32 height) { + switch (pixel_format) { + case PixelFormat::G8R8U: + case PixelFormat::G8R8S: + case PixelFormat::ASTC_2D_4X4: + case PixelFormat::ASTC_2D_8X8: { + LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented", + static_cast<u32>(pixel_format)); + UNREACHABLE(); + break; + } + case PixelFormat::S8Z24: + // Convert the Z24S8 depth format to S8Z24, as OpenGL does not support S8Z24. + ConvertS8Z24ToZ24S8(data, width, height, true); + break; + } +} + MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192)); void CachedSurface::LoadGLBuffer() { - ASSERT(params.type != SurfaceType::Fill); - - const u8* const texture_src_data = Memory::GetPointer(params.addr); - - ASSERT(texture_src_data); - - const u32 bytes_per_pixel = GetGLBytesPerPixel(params.pixel_format); - const u32 copy_size = params.width * params.height * bytes_per_pixel; - const std::size_t total_size = copy_size * params.depth; - MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); + gl_buffer.resize(params.size_in_bytes_gl); if (params.is_tiled) { - gl_buffer.resize(total_size); + u32 depth = params.depth; + u32 block_depth = params.block_depth; - // TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do - // this for 3D textures, etc. - switch (params.target) { - case SurfaceParams::SurfaceTarget::Texture2D: - // Pass impl. to the fallback code below - break; - case SurfaceParams::SurfaceTarget::Texture2DArray: - case SurfaceParams::SurfaceTarget::TextureCubemap: - for (std::size_t index = 0; index < params.depth; ++index) { - const std::size_t offset{index * copy_size}; - morton_to_gl_fns[static_cast<std::size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, gl_buffer.data() + offset, - copy_size, params.addr + offset); - } - break; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented tiled load for target={}", - static_cast<u32>(params.target)); - UNREACHABLE(); + ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", + params.block_width, static_cast<u32>(params.target)); + + if (params.target == SurfaceParams::SurfaceTarget::Texture2D) { + // TODO(Blinkhawk): Eliminate this condition once all texture types are implemented. + depth = 1U; + block_depth = 1U; } morton_to_gl_fns[static_cast<std::size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, gl_buffer.data(), copy_size, - params.addr); + params.width, params.block_height, params.height, block_depth, depth, gl_buffer.data(), + gl_buffer.size(), params.addr); } else { - const u8* const texture_src_data_end{texture_src_data + total_size}; + const auto texture_src_data{Memory::GetPointer(params.addr)}; + const auto texture_src_data_end{texture_src_data + params.size_in_bytes_gl}; gl_buffer.assign(texture_src_data, texture_src_data_end); } @@ -852,7 +906,44 @@ void CachedSurface::LoadGLBuffer() { MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); void CachedSurface::FlushGLBuffer() { - ASSERT_MSG(false, "Unimplemented"); + MICROPROFILE_SCOPE(OpenGL_SurfaceFlush); + + ASSERT_MSG(!IsPixelFormatASTC(params.pixel_format), "Unimplemented"); + + // OpenGL temporary buffer needs to be big enough to store raw texture size + gl_buffer.resize(GetSizeInBytes()); + + const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); + // Ensure no bad interactions with GL_UNPACK_ALIGNMENT + ASSERT(params.width * SurfaceParams::GetBytesPerPixel(params.pixel_format) % 4 == 0); + glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width)); + ASSERT(!tuple.compressed); + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + glGetTextureImage(texture.handle, 0, tuple.format, tuple.type, gl_buffer.size(), + gl_buffer.data()); + glPixelStorei(GL_PACK_ROW_LENGTH, 0); + ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width, + params.height); + ASSERT(params.type != SurfaceType::Fill); + const u8* const texture_src_data = Memory::GetPointer(params.addr); + ASSERT(texture_src_data); + if (params.is_tiled) { + u32 depth = params.depth; + u32 block_depth = params.block_depth; + + ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", + params.block_width, static_cast<u32>(params.target)); + + if (params.target == SurfaceParams::SurfaceTarget::Texture2D) { + // TODO(Blinkhawk): Eliminate this condition once all texture types are implemented. + depth = 1U; + } + gl_to_morton_fns[static_cast<size_t>(params.pixel_format)]( + params.width, params.block_height, params.height, block_depth, depth, gl_buffer.data(), + gl_buffer.size(), GetAddr()); + } else { + std::memcpy(Memory::GetPointer(GetAddr()), gl_buffer.data(), GetSizeInBytes()); + } } MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192)); @@ -862,9 +953,6 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle MICROPROFILE_SCOPE(OpenGL_TextureUL); - ASSERT(gl_buffer.size() == static_cast<std::size_t>(params.width) * params.height * - GetGLBytesPerPixel(params.pixel_format) * params.depth); - const auto& rect{params.GetRect()}; // Load data from memory to the surface @@ -873,7 +961,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle std::size_t buffer_offset = static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.width + static_cast<std::size_t>(x0)) * - GetGLBytesPerPixel(params.pixel_format); + SurfaceParams::GetBytesPerPixel(params.pixel_format); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); const GLuint target_tex = texture.handle; @@ -889,7 +977,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle cur_state.Apply(); // Ensure no bad interactions with GL_UNPACK_ALIGNMENT - ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0); + ASSERT(params.width * SurfaceParams::GetBytesPerPixel(params.pixel_format) % 4 == 0); glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.width)); glActiveTexture(GL_TEXTURE0); @@ -899,7 +987,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle glCompressedTexImage2D( SurfaceTargetToGL(params.target), 0, tuple.internal_format, static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.size_in_bytes_2d), &gl_buffer[buffer_offset]); + static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); break; case SurfaceParams::SurfaceTarget::Texture3D: case SurfaceParams::SurfaceTarget::Texture2DArray: @@ -907,16 +995,16 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle SurfaceTargetToGL(params.target), 0, tuple.internal_format, static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), static_cast<GLsizei>(params.depth), 0, - static_cast<GLsizei>(params.size_in_bytes_total), &gl_buffer[buffer_offset]); + static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); break; case SurfaceParams::SurfaceTarget::TextureCubemap: for (std::size_t face = 0; face < params.depth; ++face) { glCompressedTexImage2D(static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face), 0, tuple.internal_format, static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.size_in_bytes_2d), + static_cast<GLsizei>(params.SizeInBytesCubeFaceGL()), &gl_buffer[buffer_offset]); - buffer_offset += params.size_in_bytes_2d; + buffer_offset += params.SizeInBytesCubeFace(); } break; default: @@ -926,7 +1014,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle glCompressedTexImage2D( GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.size_in_bytes_2d), &gl_buffer[buffer_offset]); + static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); } } else { @@ -955,7 +1043,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle y0, static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, &gl_buffer[buffer_offset]); - buffer_offset += params.size_in_bytes_2d; + buffer_offset += params.SizeInBytesCubeFace(); } break; default: @@ -989,7 +1077,9 @@ Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) { } SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer( - regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format)}; + regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format, + regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height, + regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)}; return GetSurface(depth_params, preserve_contents); } @@ -1015,10 +1105,7 @@ Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool pre void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { surface->LoadGLBuffer(); surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle); -} - -void RasterizerCacheOpenGL::FlushSurface(const Surface& surface) { - surface->FlushGLBuffer(); + surface->MarkAsModified(false, *this); } Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) { @@ -1087,6 +1174,14 @@ void RasterizerCacheOpenGL::FermiCopySurface( FastCopySurface(GetSurface(src_params, true), GetSurface(dst_params, false)); } +void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface, + const Surface& dst_surface) { + const auto& src_params{src_surface->GetSurfaceParams()}; + const auto& dst_params{dst_surface->GetSurfaceParams()}; + FlushRegion(src_params.addr, dst_params.size_in_bytes); + LoadSurface(dst_surface); +} + Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, const SurfaceParams& new_params) { // Verify surface is compatible for blitting @@ -1095,6 +1190,12 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, // Get a new surface with the new parameters, and blit the previous surface to it Surface new_surface{GetUncachedSurface(new_params)}; + // With use_accurate_gpu_emulation enabled, do an accurate surface copy + if (Settings::values.use_accurate_gpu_emulation) { + AccurateCopySurface(old_surface, new_surface); + return new_surface; + } + // For compatible surfaces, we can just do fast glCopyImageSubData based copy if (old_params.target == new_params.target && old_params.type == new_params.type && old_params.depth == new_params.depth && old_params.depth == 1 && @@ -1106,11 +1207,10 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, // If the format is the same, just do a framebuffer blit. This is significantly faster than // using PBOs. The is also likely less accurate, as textures will be converted rather than - // reinterpreted. When use_accurate_framebuffers setting is enabled, perform a more accurate + // reinterpreted. When use_accurate_gpu_emulation setting is enabled, perform a more accurate // surface copy, where pixels are reinterpreted as a new format (without conversion). This // code path uses OpenGL PBOs and is quite slow. - const bool is_blit{old_params.pixel_format == new_params.pixel_format || - !Settings::values.use_accurate_framebuffers}; + const bool is_blit{old_params.pixel_format == new_params.pixel_format}; switch (new_params.target) { case SurfaceParams::SurfaceTarget::Texture2D: diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 0b4940b3c..7c1cb72d0 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -18,6 +18,7 @@ #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_gen.h" +#include "video_core/textures/decoders.h" #include "video_core/textures/texture.h" namespace OpenGL { @@ -74,19 +75,21 @@ struct SurfaceParams { RG32UI = 43, R32UI = 44, ASTC_2D_8X8 = 45, + ASTC_2D_8X5 = 46, + ASTC_2D_5X4 = 47, MaxColorFormat, // Depth formats - Z32F = 46, - Z16 = 47, + Z32F = 48, + Z16 = 49, MaxDepthFormat, // DepthStencil formats - Z24S8 = 48, - S8Z24 = 49, - Z32FS8 = 50, + Z24S8 = 50, + S8Z24 = 51, + Z32FS8 = 52, MaxDepthStencilFormat, @@ -220,6 +223,8 @@ struct SurfaceParams { 1, // RG32UI 1, // R32UI 4, // ASTC_2D_8X8 + 4, // ASTC_2D_8X5 + 4, // ASTC_2D_5X4 1, // Z32F 1, // Z16 1, // Z24S8 @@ -282,6 +287,8 @@ struct SurfaceParams { 64, // RG32UI 32, // R32UI 16, // ASTC_2D_8X8 + 32, // ASTC_2D_8X5 + 32, // ASTC_2D_5X4 32, // Z32F 16, // Z16 32, // Z24S8 @@ -553,8 +560,12 @@ struct SurfaceParams { return PixelFormat::BC6H_SF16; case Tegra::Texture::TextureFormat::ASTC_2D_4X4: return PixelFormat::ASTC_2D_4X4; + case Tegra::Texture::TextureFormat::ASTC_2D_5X4: + return PixelFormat::ASTC_2D_5X4; case Tegra::Texture::TextureFormat::ASTC_2D_8X8: return PixelFormat::ASTC_2D_8X8; + case Tegra::Texture::TextureFormat::ASTC_2D_8X5: + return PixelFormat::ASTC_2D_8X5; case Tegra::Texture::TextureFormat::R16_G16: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: @@ -691,21 +702,42 @@ struct SurfaceParams { return SurfaceType::Invalid; } + /// Returns the sizer in bytes of the specified pixel format + static constexpr u32 GetBytesPerPixel(PixelFormat pixel_format) { + if (pixel_format == SurfaceParams::PixelFormat::Invalid) { + return 0; + } + return GetFormatBpp(pixel_format) / CHAR_BIT; + } + /// Returns the rectangle corresponding to this surface MathUtil::Rectangle<u32> GetRect() const; - /// Returns the size of this surface as a 2D texture in bytes, adjusted for compression - std::size_t SizeInBytes2D() const { + /// Returns the total size of this surface in bytes, adjusted for compression + std::size_t SizeInBytesRaw(bool ignore_tiled = false) const { const u32 compression_factor{GetCompressionFactor(pixel_format)}; - ASSERT(width % compression_factor == 0); - ASSERT(height % compression_factor == 0); - return (width / compression_factor) * (height / compression_factor) * - GetFormatBpp(pixel_format) / CHAR_BIT; + const u32 bytes_per_pixel{GetBytesPerPixel(pixel_format)}; + const size_t uncompressed_size{ + Tegra::Texture::CalculateSize((ignore_tiled ? false : is_tiled), bytes_per_pixel, width, + height, depth, block_height, block_depth)}; + + // Divide by compression_factor^2, as height and width are factored by this + return uncompressed_size / (compression_factor * compression_factor); } - /// Returns the total size of this surface in bytes, adjusted for compression - std::size_t SizeInBytesTotal() const { - return SizeInBytes2D() * depth; + /// Returns the size of this surface as an OpenGL texture in bytes + std::size_t SizeInBytesGL() const { + return SizeInBytesRaw(true); + } + + /// Returns the size of this surface as a cube face in bytes + std::size_t SizeInBytesCubeFace() const { + return size_in_bytes / 6; + } + + /// Returns the size of this surface as an OpenGL cube face in bytes + std::size_t SizeInBytesCubeFaceGL() const { + return size_in_bytes_gl / 6; } /// Creates SurfaceParams from a texture configuration @@ -716,9 +748,10 @@ struct SurfaceParams { static SurfaceParams CreateForFramebuffer(std::size_t index); /// Creates SurfaceParams for a depth buffer configuration - static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height, - Tegra::GPUVAddr zeta_address, - Tegra::DepthFormat format); + static SurfaceParams CreateForDepthBuffer( + u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format, + u32 block_width, u32 block_height, u32 block_depth, + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type); /// Creates SurfaceParams for a Fermi2D surface copy static SurfaceParams CreateForFermiCopySurface( @@ -731,9 +764,13 @@ struct SurfaceParams { other.depth); } - VAddr addr; + /// Initializes parameters for caching, should be called after everything has been initialized + void InitCacheParameters(Tegra::GPUVAddr gpu_addr); + bool is_tiled; + u32 block_width; u32 block_height; + u32 block_depth; PixelFormat pixel_format; ComponentType component_type; SurfaceType type; @@ -741,11 +778,15 @@ struct SurfaceParams { u32 height; u32 depth; u32 unaligned_height; - std::size_t size_in_bytes_total; - std::size_t size_in_bytes_2d; SurfaceTarget target; u32 max_mip_level; + // Parameters used for caching + VAddr addr; + Tegra::GPUVAddr gpu_addr; + std::size_t size_in_bytes; + std::size_t size_in_bytes_gl; + // Render target specific parameters, not used in caching struct { u32 index; @@ -762,7 +803,8 @@ struct SurfaceReserveKey : Common::HashableStruct<OpenGL::SurfaceParams> { static SurfaceReserveKey Create(const OpenGL::SurfaceParams& params) { SurfaceReserveKey res; res.state = params; - res.state.rt = {}; // Ignore rt config in caching + res.state.gpu_addr = {}; // Ignore GPU vaddr in caching + res.state.rt = {}; // Ignore rt config in caching return res; } }; @@ -777,16 +819,20 @@ struct hash<SurfaceReserveKey> { namespace OpenGL { -class CachedSurface final { +class CachedSurface final : public RasterizerCacheObject { public: CachedSurface(const SurfaceParams& params); - VAddr GetAddr() const { + VAddr GetAddr() const override { return params.addr; } - std::size_t GetSizeInBytes() const { - return params.size_in_bytes_total; + std::size_t GetSizeInBytes() const override { + return cached_size_in_bytes; + } + + void Flush() override { + FlushGLBuffer(); } const OGLTexture& Texture() const { @@ -797,13 +843,6 @@ public: return gl_target; } - static constexpr unsigned int GetGLBytesPerPixel(SurfaceParams::PixelFormat format) { - if (format == SurfaceParams::PixelFormat::Invalid) - return 0; - - return SurfaceParams::GetFormatBpp(format) / CHAR_BIT; - } - const SurfaceParams& GetSurfaceParams() const { return params; } @@ -820,6 +859,7 @@ private: std::vector<u8> gl_buffer; SurfaceParams params; GLenum gl_target; + std::size_t cached_size_in_bytes; }; class RasterizerCacheOpenGL final : public RasterizerCache<Surface> { @@ -836,9 +876,6 @@ public: /// Get the color surface based on the framebuffer configuration and the specified render target Surface GetColorBufferSurface(std::size_t index, bool preserve_contents); - /// Flushes the surface to Switch memory - void FlushSurface(const Surface& surface); - /// Tries to find a framebuffer using on the provided CPU address Surface TryFindFramebufferSurface(VAddr addr) const; @@ -862,6 +899,9 @@ private: /// Tries to get a reserved surface for the specified parameters Surface TryGetReservedSurface(const SurfaceParams& params); + /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data + void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface); + /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have /// previously been used. This is to prevent surfaces from being constantly created and /// destroyed when used with different surface parameters. diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 7cd8f91e4..1a03a677f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -68,6 +68,10 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type) program_result = GLShader::GenerateVertexShader(setup); gl_type = GL_VERTEX_SHADER; break; + case Maxwell::ShaderProgram::Geometry: + program_result = GLShader::GenerateGeometryShader(setup); + gl_type = GL_GEOMETRY_SHADER; + break; case Maxwell::ShaderProgram::Fragment: program_result = GLShader::GenerateFragmentShader(setup); gl_type = GL_FRAGMENT_SHADER; @@ -80,11 +84,16 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type) entries = program_result.second; - OGLShader shader; - shader.Create(program_result.first.c_str(), gl_type); - program.Create(true, shader.handle); - SetShaderUniformBlockBindings(program.handle); - VideoCore::LabelGLObject(GL_PROGRAM, program.handle, addr); + if (program_type != Maxwell::ShaderProgram::Geometry) { + OGLShader shader; + shader.Create(program_result.first.c_str(), gl_type); + program.Create(true, shader.handle); + SetShaderUniformBlockBindings(program.handle); + VideoCore::LabelGLObject(GL_PROGRAM, program.handle, addr); + } else { + // Store shader's code to lazily build it on draw + geometry_programs.code = program_result.first; + } } GLuint CachedShader::GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer) { @@ -110,6 +119,21 @@ GLint CachedShader::GetUniformLocation(const GLShader::SamplerEntry& sampler) { return search->second; } +GLuint CachedShader::LazyGeometryProgram(OGLProgram& target_program, + const std::string& glsl_topology, + const std::string& debug_name) { + if (target_program.handle != 0) { + return target_program.handle; + } + const std::string source{geometry_programs.code + "layout (" + glsl_topology + ") in;\n"}; + OGLShader shader; + shader.Create(source.c_str(), GL_GEOMETRY_SHADER); + target_program.Create(true, shader.handle); + SetShaderUniformBlockBindings(target_program.handle); + VideoCore::LabelGLObject(GL_PROGRAM, target_program.handle, addr, debug_name); + return target_program.handle; +}; + Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const VAddr program_addr{GetShaderAddress(program)}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 9bafe43a9..a210f1731 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -7,6 +7,7 @@ #include <map> #include <memory> +#include "common/assert.h" #include "common/common_types.h" #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -18,28 +19,52 @@ class CachedShader; using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -class CachedShader final { +class CachedShader final : public RasterizerCacheObject { public: CachedShader(VAddr addr, Maxwell::ShaderProgram program_type); - /// Gets the address of the shader in guest memory, required for cache management - VAddr GetAddr() const { + VAddr GetAddr() const override { return addr; } - /// Gets the size of the shader in guest memory, required for cache management - std::size_t GetSizeInBytes() const { + std::size_t GetSizeInBytes() const override { return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64); } + // We do not have to flush this cache as things in it are never modified by us. + void Flush() override {} + /// Gets the shader entries for the shader const GLShader::ShaderEntries& GetShaderEntries() const { return entries; } /// Gets the GL program handle for the shader - GLuint GetProgramHandle() const { - return program.handle; + GLuint GetProgramHandle(GLenum primitive_mode) { + if (program_type != Maxwell::ShaderProgram::Geometry) { + return program.handle; + } + switch (primitive_mode) { + case GL_POINTS: + return LazyGeometryProgram(geometry_programs.points, "points", "ShaderPoints"); + case GL_LINES: + case GL_LINE_STRIP: + return LazyGeometryProgram(geometry_programs.lines, "lines", "ShaderLines"); + case GL_LINES_ADJACENCY: + case GL_LINE_STRIP_ADJACENCY: + return LazyGeometryProgram(geometry_programs.lines_adjacency, "lines_adjacency", + "ShaderLinesAdjacency"); + case GL_TRIANGLES: + case GL_TRIANGLE_STRIP: + case GL_TRIANGLE_FAN: + return LazyGeometryProgram(geometry_programs.triangles, "triangles", "ShaderTriangles"); + case GL_TRIANGLES_ADJACENCY: + case GL_TRIANGLE_STRIP_ADJACENCY: + return LazyGeometryProgram(geometry_programs.triangles_adjacency, "triangles_adjacency", + "ShaderLines"); + default: + UNREACHABLE_MSG("Unknown primitive mode."); + } } /// Gets the GL program resource location for the specified resource, caching as needed @@ -49,12 +74,30 @@ public: GLint GetUniformLocation(const GLShader::SamplerEntry& sampler); private: + /// Generates a geometry shader or returns one that already exists. + GLuint LazyGeometryProgram(OGLProgram& target_program, const std::string& glsl_topology, + const std::string& debug_name); + VAddr addr; Maxwell::ShaderProgram program_type; GLShader::ShaderSetup setup; GLShader::ShaderEntries entries; + + // Non-geometry program. OGLProgram program; + // Geometry programs. These are needed because GLSL needs an input topology but it's not + // declared by the hardware. Workaround this issue by generating a different shader per input + // topology class. + struct { + std::string code; + OGLProgram points; + OGLProgram lines; + OGLProgram lines_adjacency; + OGLProgram triangles; + OGLProgram triangles_adjacency; + } geometry_programs; + std::map<u32, GLuint> resource_cache; std::map<u32, GLint> uniform_cache; }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 85c668ca1..55c33c3a9 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -7,6 +7,7 @@ #include <string> #include <string_view> +#include <boost/optional.hpp> #include <fmt/format.h> #include "common/assert.h" @@ -29,11 +30,32 @@ using Tegra::Shader::SubOp; constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH; constexpr u32 PROGRAM_HEADER_SIZE = sizeof(Tegra::Shader::Header); +enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; + +constexpr u32 MAX_GEOMETRY_BUFFERS = 6; +constexpr u32 MAX_ATTRIBUTES = 0x100; // Size in vec4s, this value is untested + class DecompileFail : public std::runtime_error { public: using std::runtime_error::runtime_error; }; +/// Translate topology +static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "points"; + case Tegra::Shader::OutputTopology::LineStrip: + return "line_strip"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "triangle_strip"; + default: + LOG_CRITICAL(Render_OpenGL, "Unknown output topology {}", static_cast<u32>(topology)); + UNREACHABLE(); + return "points"; + } +} + /// Describes the behaviour of code path of a given entry point and a return point. enum class ExitMethod { Undetermined, ///< Internal value. Only occur when analyzing JMP loop. @@ -253,8 +275,9 @@ enum class InternalFlag : u64 { class GLSLRegisterManager { public: GLSLRegisterManager(ShaderWriter& shader, ShaderWriter& declarations, - const Maxwell3D::Regs::ShaderStage& stage, const std::string& suffix) - : shader{shader}, declarations{declarations}, stage{stage}, suffix{suffix} { + const Maxwell3D::Regs::ShaderStage& stage, const std::string& suffix, + const Tegra::Shader::Header& header) + : shader{shader}, declarations{declarations}, stage{stage}, suffix{suffix}, header{header} { BuildRegisterList(); BuildInputList(); } @@ -358,11 +381,13 @@ public: * @param reg The destination register to use. * @param elem The element to use for the operation. * @param attribute The input attribute to use as the source value. + * @param vertex The register that decides which vertex to read from (used in GS). */ void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute, - const Tegra::Shader::IpaMode& input_mode) { + const Tegra::Shader::IpaMode& input_mode, + boost::optional<Register> vertex = {}) { const std::string dest = GetRegisterAsFloat(reg); - const std::string src = GetInputAttribute(attribute, input_mode) + GetSwizzle(elem); + const std::string src = GetInputAttribute(attribute, input_mode, vertex) + GetSwizzle(elem); shader.AddLine(dest + " = " + src + ';'); } @@ -391,16 +416,29 @@ public: * are stored as floats, so this may require conversion. * @param attribute The destination output attribute. * @param elem The element to use for the operation. - * @param reg The register to use as the source value. + * @param val_reg The register to use as the source value. + * @param buf_reg The register that tells which buffer to write to (used in geometry shaders). */ - void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& reg) { + void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& val_reg, + const Register& buf_reg) { const std::string dest = GetOutputAttribute(attribute); - const std::string src = GetRegisterAsFloat(reg); + const std::string src = GetRegisterAsFloat(val_reg); if (!dest.empty()) { // Can happen with unknown/unimplemented output attributes, in which case we ignore the // instruction for now. - shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';'); + if (stage == Maxwell3D::Regs::ShaderStage::Geometry) { + // TODO(Rodrigo): nouveau sets some attributes after setting emitting a geometry + // shader. These instructions use a dirty register as buffer index. To avoid some + // drivers from complaining for the out of boundary writes, guard them. + const std::string buf_index{"min(" + GetRegisterAsInteger(buf_reg) + ", " + + std::to_string(MAX_GEOMETRY_BUFFERS - 1) + ')'}; + shader.AddLine("amem[" + buf_index + "][" + + std::to_string(static_cast<u32>(attribute)) + ']' + + GetSwizzle(elem) + " = " + src + ';'); + } else { + shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';'); + } } } @@ -441,41 +479,123 @@ public: } } - /// Add declarations for registers + /// Add declarations. void GenerateDeclarations(const std::string& suffix) { + GenerateRegisters(suffix); + GenerateInternalFlags(); + GenerateInputAttrs(); + GenerateOutputAttrs(); + GenerateConstBuffers(); + GenerateSamplers(); + GenerateGeometry(); + } + + /// Returns a list of constant buffer declarations. + std::vector<ConstBufferEntry> GetConstBuffersDeclarations() const { + std::vector<ConstBufferEntry> result; + std::copy_if(declr_const_buffers.begin(), declr_const_buffers.end(), + std::back_inserter(result), [](const auto& entry) { return entry.IsUsed(); }); + return result; + } + + /// Returns a list of samplers used in the shader. + const std::vector<SamplerEntry>& GetSamplers() const { + return used_samplers; + } + + /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if + /// necessary. + std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type, + bool is_array, bool is_shadow) { + const auto offset = static_cast<std::size_t>(sampler.index.Value()); + + // If this sampler has already been used, return the existing mapping. + const auto itr = + std::find_if(used_samplers.begin(), used_samplers.end(), + [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; }); + + if (itr != used_samplers.end()) { + ASSERT(itr->GetType() == type && itr->IsArray() == is_array && + itr->IsShadow() == is_shadow); + return itr->GetName(); + } + + // Otherwise create a new mapping for this sampler + const std::size_t next_index = used_samplers.size(); + const SamplerEntry entry{stage, offset, next_index, type, is_array, is_shadow}; + used_samplers.emplace_back(entry); + return entry.GetName(); + } + +private: + /// Generates declarations for registers. + void GenerateRegisters(const std::string& suffix) { for (const auto& reg : regs) { declarations.AddLine(GLSLRegister::GetTypeString() + ' ' + reg.GetPrefixString() + std::to_string(reg.GetIndex()) + '_' + suffix + " = 0;"); } declarations.AddNewLine(); + } + /// Generates declarations for internal flags. + void GenerateInternalFlags() { for (u32 ii = 0; ii < static_cast<u64>(InternalFlag::Amount); ii++) { const InternalFlag code = static_cast<InternalFlag>(ii); declarations.AddLine("bool " + GetInternalFlag(code) + " = false;"); } declarations.AddNewLine(); + } + + /// Generates declarations for input attributes. + void GenerateInputAttrs() { + if (stage != Maxwell3D::Regs::ShaderStage::Vertex) { + const std::string attr = + stage == Maxwell3D::Regs::ShaderStage::Geometry ? "gs_position[]" : "position"; + declarations.AddLine("layout (location = " + std::to_string(POSITION_VARYING_LOCATION) + + ") in vec4 " + attr + ';'); + } for (const auto element : declr_input_attribute) { // TODO(bunnei): Use proper number of elements for these u32 idx = static_cast<u32>(element.first) - static_cast<u32>(Attribute::Index::Attribute_0); - declarations.AddLine("layout(location = " + std::to_string(idx) + ")" + - GetInputFlags(element.first) + "in vec4 " + - GetInputAttribute(element.first, element.second) + ';'); + if (stage != Maxwell3D::Regs::ShaderStage::Vertex) { + // If inputs are varyings, add an offset + idx += GENERIC_VARYING_START_LOCATION; + } + + std::string attr{GetInputAttribute(element.first, element.second)}; + if (stage == Maxwell3D::Regs::ShaderStage::Geometry) { + attr = "gs_" + attr + "[]"; + } + declarations.AddLine("layout (location = " + std::to_string(idx) + ") " + + GetInputFlags(element.first) + "in vec4 " + attr + ';'); } + declarations.AddNewLine(); + } + /// Generates declarations for output attributes. + void GenerateOutputAttrs() { + if (stage != Maxwell3D::Regs::ShaderStage::Fragment) { + declarations.AddLine("layout (location = " + std::to_string(POSITION_VARYING_LOCATION) + + ") out vec4 position;"); + } for (const auto& index : declr_output_attribute) { // TODO(bunnei): Use proper number of elements for these - declarations.AddLine("layout(location = " + - std::to_string(static_cast<u32>(index) - - static_cast<u32>(Attribute::Index::Attribute_0)) + - ") out vec4 " + GetOutputAttribute(index) + ';'); + const u32 idx = static_cast<u32>(index) - + static_cast<u32>(Attribute::Index::Attribute_0) + + GENERIC_VARYING_START_LOCATION; + declarations.AddLine("layout (location = " + std::to_string(idx) + ") out vec4 " + + GetOutputAttribute(index) + ';'); } declarations.AddNewLine(); + } + /// Generates declarations for constant buffers. + void GenerateConstBuffers() { for (const auto& entry : GetConstBuffersDeclarations()) { - declarations.AddLine("layout(std140) uniform " + entry.GetName()); + declarations.AddLine("layout (std140) uniform " + entry.GetName()); declarations.AddLine('{'); declarations.AddLine(" vec4 c" + std::to_string(entry.GetIndex()) + "[MAX_CONSTBUFFER_ELEMENTS];"); @@ -483,7 +603,10 @@ public: declarations.AddNewLine(); } declarations.AddNewLine(); + } + /// Generates declarations for samplers. + void GenerateSamplers() { const auto& samplers = GetSamplers(); for (const auto& sampler : samplers) { declarations.AddLine("uniform " + sampler.GetTypeString() + ' ' + sampler.GetName() + @@ -492,44 +615,42 @@ public: declarations.AddNewLine(); } - /// Returns a list of constant buffer declarations - std::vector<ConstBufferEntry> GetConstBuffersDeclarations() const { - std::vector<ConstBufferEntry> result; - std::copy_if(declr_const_buffers.begin(), declr_const_buffers.end(), - std::back_inserter(result), [](const auto& entry) { return entry.IsUsed(); }); - return result; - } - - /// Returns a list of samplers used in the shader - const std::vector<SamplerEntry>& GetSamplers() const { - return used_samplers; - } - - /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if - /// necessary. - std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow) { - const std::size_t offset = static_cast<std::size_t>(sampler.index.Value()); + /// Generates declarations used for geometry shaders. + void GenerateGeometry() { + if (stage != Maxwell3D::Regs::ShaderStage::Geometry) + return; - // If this sampler has already been used, return the existing mapping. - const auto itr = - std::find_if(used_samplers.begin(), used_samplers.end(), - [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; }); + declarations.AddLine( + "layout (" + GetTopologyName(header.common3.output_topology) + + ", max_vertices = " + std::to_string(header.common4.max_output_vertices) + ") out;"); + declarations.AddNewLine(); - if (itr != used_samplers.end()) { - ASSERT(itr->GetType() == type && itr->IsArray() == is_array && - itr->IsShadow() == is_shadow); - return itr->GetName(); - } + declarations.AddLine("vec4 amem[" + std::to_string(MAX_GEOMETRY_BUFFERS) + "][" + + std::to_string(MAX_ATTRIBUTES) + "];"); + declarations.AddNewLine(); - // Otherwise create a new mapping for this sampler - const std::size_t next_index = used_samplers.size(); - const SamplerEntry entry{stage, offset, next_index, type, is_array, is_shadow}; - used_samplers.emplace_back(entry); - return entry.GetName(); + constexpr char buffer[] = "amem[output_buffer]"; + declarations.AddLine("void emit_vertex(uint output_buffer) {"); + ++declarations.scope; + for (const auto element : declr_output_attribute) { + declarations.AddLine(GetOutputAttribute(element) + " = " + buffer + '[' + + std::to_string(static_cast<u32>(element)) + "];"); + } + + declarations.AddLine("position = " + std::string(buffer) + '[' + + std::to_string(static_cast<u32>(Attribute::Index::Position)) + "];"); + + // If a geometry shader is attached, it will always flip (it's the last stage before + // fragment). For more info about flipping, refer to gl_shader_gen.cpp. + declarations.AddLine("position.xy *= viewport_flip.xy;"); + declarations.AddLine("gl_Position = position;"); + declarations.AddLine("position.w = 1.0;"); + declarations.AddLine("EmitVertex();"); + --declarations.scope; + declarations.AddLine('}'); + declarations.AddNewLine(); } -private: /// Generates code representing a temporary (GPR) register. std::string GetRegister(const Register& reg, unsigned elem) { if (reg == Register::ZeroIndex) { @@ -586,11 +707,19 @@ private: /// Generates code representing an input attribute register. std::string GetInputAttribute(Attribute::Index attribute, - const Tegra::Shader::IpaMode& input_mode) { + const Tegra::Shader::IpaMode& input_mode, + boost::optional<Register> vertex = {}) { + auto GeometryPass = [&](const std::string& name) { + if (stage == Maxwell3D::Regs::ShaderStage::Geometry && vertex) { + return "gs_" + name + '[' + GetRegisterAsInteger(vertex.value(), 0, false) + ']'; + } + return name; + }; + switch (attribute) { case Attribute::Index::Position: if (stage != Maxwell3D::Regs::ShaderStage::Fragment) { - return "position"; + return GeometryPass("position"); } else { return "vec4(gl_FragCoord.x, gl_FragCoord.y, gl_FragCoord.z, 1.0)"; } @@ -619,7 +748,7 @@ private: UNREACHABLE(); } } - return "input_attribute_" + std::to_string(index); + return GeometryPass("input_attribute_" + std::to_string(index)); } LOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", static_cast<u32>(attribute)); @@ -672,7 +801,7 @@ private: return out; } - /// Generates code representing an output attribute register. + /// Generates code representing the declaration name of an output attribute register. std::string GetOutputAttribute(Attribute::Index attribute) { switch (attribute) { case Attribute::Index::Position: @@ -708,6 +837,7 @@ private: std::vector<SamplerEntry> used_samplers; const Maxwell3D::Regs::ShaderStage& stage; const std::string& suffix; + const Tegra::Shader::Header& header; }; class GLSLGenerator { @@ -1103,8 +1233,8 @@ private: return offset + 1; } - shader.AddLine("// " + std::to_string(offset) + ": " + opcode->GetName() + " (" + - std::to_string(instr.value) + ')'); + shader.AddLine( + fmt::format("// {}: {} (0x{:016x})", offset, opcode->GetName(), instr.value)); using Tegra::Shader::Pred; ASSERT_MSG(instr.pred.full_pred != Pred::NeverExecute, @@ -1306,7 +1436,6 @@ private: break; } - case OpCode::Type::Shift: { std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true); std::string op_b; @@ -1348,7 +1477,6 @@ private: } break; } - case OpCode::Type::ArithmeticIntegerImmediate: { std::string op_a = regs.GetRegisterAsInteger(instr.gpr8); std::string op_b = std::to_string(instr.alu.imm20_32.Value()); @@ -1826,7 +1954,7 @@ private: const auto LoadNextElement = [&](u32 reg_offset) { regs.SetRegisterToInputAttibute(instr.gpr0.Value() + reg_offset, next_element, static_cast<Attribute::Index>(next_index), - input_mode); + input_mode, instr.gpr39.Value()); // Load the next attribute element into the following register. If the element // to load goes beyond the vec4 size, load the first element of the next @@ -1890,8 +2018,8 @@ private: const auto StoreNextElement = [&](u32 reg_offset) { regs.SetOutputAttributeToRegister(static_cast<Attribute::Index>(next_index), - next_element, - instr.gpr0.Value() + reg_offset); + next_element, instr.gpr0.Value() + reg_offset, + instr.gpr39.Value()); // Load the next attribute element into the following register. If the element // to load goes beyond the vec4 size, load the first element of the next @@ -1908,9 +2036,9 @@ private: break; } case OpCode::Id::TEX: { - ASSERT_MSG(instr.tex.array == 0, "TEX arrays unimplemented"); Tegra::Shader::TextureType texture_type{instr.tex.texture_type}; std::string coord; + const bool is_array = instr.tex.array != 0; ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), "NODEP is not implemented"); @@ -1925,21 +2053,59 @@ private: switch (num_coordinates) { case 1: { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - coord = "float coords = " + x + ';'; + if (is_array) { + const std::string index = regs.GetRegisterAsInteger(instr.gpr8); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + index + ");"; + } else { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + coord = "float coords = " + x + ';'; + } break; } case 2: { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + if (is_array) { + const std::string index = regs.GetRegisterAsInteger(instr.gpr8); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); + coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");"; + } else { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + } break; } case 3: { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string z = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; + if (depth_compare) { + if (is_array) { + const std::string index = regs.GetRegisterAsInteger(instr.gpr8); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string y = regs.GetRegisterAsFloat(instr.gpr20); + const std::string z = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); + coord = "vec4 coords = vec4(" + x + ", " + y + ", " + z + ", " + index + + ");"; + } else { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string z = regs.GetRegisterAsFloat(instr.gpr20); + coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; + } + } else { + if (is_array) { + const std::string index = regs.GetRegisterAsInteger(instr.gpr8); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); + const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 3); + coord = "vec4 coords = vec4(" + x + ", " + y + ", " + z + ", " + index + + ");"; + } else { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); + coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; + } + } break; } default: @@ -1958,7 +2124,7 @@ private: std::string op_c; const std::string sampler = - GetSampler(instr.sampler, texture_type, false, depth_compare); + GetSampler(instr.sampler, texture_type, is_array, depth_compare); // Add an extra scope and declare the texture coords inside to prevent // overwriting them in case they are used as outputs of the texs instruction. @@ -1978,10 +2144,13 @@ private: } case Tegra::Shader::TextureProcessMode::LB: case Tegra::Shader::TextureProcessMode::LBA: { - if (num_coordinates <= 2) { - op_c = regs.GetRegisterAsFloat(instr.gpr20); + if (depth_compare) { + if (is_array) + op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 2); + else + op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); } else { - op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); + op_c = regs.GetRegisterAsFloat(instr.gpr20); } // TODO: Figure if A suffix changes the equation at all. texture = "texture(" + sampler + ", coords, " + op_c + ')'; @@ -2124,6 +2293,8 @@ private: ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::MZ), "MZ is not implemented"); + u32 op_c_offset = 0; + switch (texture_type) { case Tegra::Shader::TextureType::Texture1D: { const std::string x = regs.GetRegisterAsInteger(instr.gpr8); @@ -2138,6 +2309,7 @@ private: const std::string x = regs.GetRegisterAsInteger(instr.gpr8); const std::string y = regs.GetRegisterAsInteger(instr.gpr20); coord = "ivec2 coords = ivec2(" + x + ", " + y + ");"; + op_c_offset = 1; } break; } @@ -2149,13 +2321,14 @@ private: const std::string sampler = GetSampler(instr.sampler, texture_type, is_array, false); std::string texture = "texelFetch(" + sampler + ", coords, 0)"; - const std::string op_c = regs.GetRegisterAsInteger(instr.gpr20.Value() + 1); switch (instr.tlds.GetTextureProcessMode()) { case Tegra::Shader::TextureProcessMode::LZ: { texture = "texelFetch(" + sampler + ", coords, 0)"; break; } case Tegra::Shader::TextureProcessMode::LL: { + const std::string op_c = + regs.GetRegisterAsInteger(instr.gpr20.Value() + op_c_offset); texture = "texelFetch(" + sampler + ", coords, " + op_c + ')'; break; } @@ -2496,14 +2669,14 @@ private: const std::string pred = GetPredicateCondition(instr.csetp.pred39, instr.csetp.neg_pred39 != 0); const std::string combiner = GetPredicateCombiner(instr.csetp.op); - const std::string controlCode = regs.GetControlCode(instr.csetp.cc); + const std::string control_code = regs.GetControlCode(instr.csetp.cc); if (instr.csetp.pred3 != static_cast<u64>(Pred::UnusedIndex)) { SetPredicate(instr.csetp.pred3, - '(' + controlCode + ") " + combiner + " (" + pred + ')'); + '(' + control_code + ") " + combiner + " (" + pred + ')'); } if (instr.csetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) { SetPredicate(instr.csetp.pred0, - "!(" + controlCode + ") " + combiner + " (" + pred + ')'); + "!(" + control_code + ") " + combiner + " (" + pred + ')'); } break; } @@ -2734,6 +2907,52 @@ private: break; } + case OpCode::Id::OUT_R: { + ASSERT(instr.gpr20.Value() == Register::ZeroIndex); + ASSERT_MSG(stage == Maxwell3D::Regs::ShaderStage::Geometry, + "OUT is expected to be used in a geometry shader."); + + if (instr.out.emit) { + // gpr0 is used to store the next address. Hardware returns a pointer but + // we just return the next index with a cyclic cap. + const std::string current{regs.GetRegisterAsInteger(instr.gpr8, 0, false)}; + const std::string next = "((" + current + " + 1" + ") % " + + std::to_string(MAX_GEOMETRY_BUFFERS) + ')'; + shader.AddLine("emit_vertex(" + current + ");"); + regs.SetRegisterToInteger(instr.gpr0, false, 0, next, 1, 1); + } + if (instr.out.cut) { + shader.AddLine("EndPrimitive();"); + } + + break; + } + case OpCode::Id::MOV_SYS: { + switch (instr.sys20) { + case Tegra::Shader::SystemVariable::InvocationInfo: { + LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete"); + regs.SetRegisterToInteger(instr.gpr0, false, 0, "0u", 1, 1); + break; + } + default: { + LOG_CRITICAL(HW_GPU, "Unhandled system move: {}", + static_cast<u32>(instr.sys20.Value())); + UNREACHABLE(); + } + } + break; + } + case OpCode::Id::ISBERD: { + ASSERT(instr.isberd.o == 0); + ASSERT(instr.isberd.skew == 0); + ASSERT(instr.isberd.shift == Tegra::Shader::IsberdShift::None); + ASSERT(instr.isberd.mode == Tegra::Shader::IsberdMode::None); + ASSERT_MSG(stage == Maxwell3D::Regs::ShaderStage::Geometry, + "ISBERD is expected to be used in a geometry shader."); + LOG_WARNING(HW_GPU, "ISBERD instruction is incomplete"); + regs.SetRegisterToFloat(instr.gpr0, 0, regs.GetRegisterAsFloat(instr.gpr8), 1, 1); + break; + } case OpCode::Id::BRA: { ASSERT_MSG(instr.bra.constant_buffer == 0, "BRA with constant buffers are not implemented"); @@ -2777,6 +2996,88 @@ private: LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed"); break; } + case OpCode::Id::VMAD: { + const bool signed_a = instr.vmad.signed_a == 1; + const bool signed_b = instr.vmad.signed_b == 1; + const bool result_signed = signed_a || signed_b; + boost::optional<std::string> forced_result; + + auto Unpack = [&](const std::string& op, bool is_chunk, bool is_signed, + Tegra::Shader::VmadType type, u64 byte_height) { + const std::string value = [&]() { + if (!is_chunk) { + const auto offset = static_cast<u32>(byte_height * 8); + return "((" + op + " >> " + std::to_string(offset) + ") & 0xff)"; + } + const std::string zero = "0"; + + switch (type) { + case Tegra::Shader::VmadType::Size16_Low: + return '(' + op + " & 0xffff)"; + case Tegra::Shader::VmadType::Size16_High: + return '(' + op + " >> 16)"; + case Tegra::Shader::VmadType::Size32: + // TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when + // this type is used (1 * 1 + 0 == 0x5b800000). Until a better + // explanation is found: assert. + UNREACHABLE_MSG("Unimplemented"); + return zero; + case Tegra::Shader::VmadType::Invalid: + // Note(Rodrigo): This flag is invalid according to nvdisasm. From my + // testing (even though it's invalid) this makes the whole instruction + // assign zero to target register. + forced_result = boost::make_optional(zero); + return zero; + default: + UNREACHABLE(); + return zero; + } + }(); + + if (is_signed) { + return "int(" + value + ')'; + } + return value; + }; + + const std::string op_a = Unpack(regs.GetRegisterAsInteger(instr.gpr8, 0, false), + instr.vmad.is_byte_chunk_a != 0, signed_a, + instr.vmad.type_a, instr.vmad.byte_height_a); + + std::string op_b; + if (instr.vmad.use_register_b) { + op_b = Unpack(regs.GetRegisterAsInteger(instr.gpr20, 0, false), + instr.vmad.is_byte_chunk_b != 0, signed_b, instr.vmad.type_b, + instr.vmad.byte_height_b); + } else { + op_b = '(' + + std::to_string(signed_b ? static_cast<s16>(instr.alu.GetImm20_16()) + : instr.alu.GetImm20_16()) + + ')'; + } + + const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39, 0, result_signed); + + std::string result; + if (forced_result) { + result = *forced_result; + } else { + result = '(' + op_a + " * " + op_b + " + " + op_c + ')'; + + switch (instr.vmad.shr) { + case Tegra::Shader::VmadShr::Shr7: + result = '(' + result + " >> 7)"; + break; + case Tegra::Shader::VmadShr::Shr15: + result = '(' + result + " >> 15)"; + break; + } + } + regs.SetRegisterToInteger(instr.gpr0, result_signed, 1, result, 1, 1, + instr.vmad.saturate == 1, 0, Register::Size::Word, + instr.vmad.cc); + break; + } default: { LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName()); UNREACHABLE(); @@ -2907,7 +3208,7 @@ private: ShaderWriter shader; ShaderWriter declarations; - GLSLRegisterManager regs{shader, declarations, stage, suffix}; + GLSLRegisterManager regs{shader, declarations, stage, suffix, header}; // Declarations std::set<std::string> declr_predicates; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index b0466c18f..1e5eb32df 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -17,7 +17,18 @@ ProgramResult GenerateVertexShader(const ShaderSetup& setup) { std::string out = "#version 430 core\n"; out += "#extension GL_ARB_separate_shader_objects : enable\n\n"; out += Decompiler::GetCommonDeclarations(); - out += "bool exec_vertex();\n"; + + out += R"( +out gl_PerVertex { + vec4 gl_Position; +}; + +layout(std140) uniform vs_config { + vec4 viewport_flip; + uvec4 instance_id; + uvec4 flip_stage; +}; +)"; if (setup.IsDualProgram()) { out += "bool exec_vertex_b();\n"; @@ -28,18 +39,17 @@ ProgramResult GenerateVertexShader(const ShaderSetup& setup) { Maxwell3D::Regs::ShaderStage::Vertex, "vertex") .get_value_or({}); - out += R"( - -out gl_PerVertex { - vec4 gl_Position; -}; + out += program.first; -out vec4 position; + if (setup.IsDualProgram()) { + ProgramResult program_b = + Decompiler::DecompileProgram(setup.program.code_b, PROGRAM_OFFSET, + Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b") + .get_value_or({}); + out += program_b.first; + } -layout (std140) uniform vs_config { - vec4 viewport_flip; - uvec4 instance_id; -}; + out += R"( void main() { position = vec4(0.0, 0.0, 0.0, 0.0); @@ -52,27 +62,52 @@ void main() { out += R"( - // Viewport can be flipped, which is unsupported by glViewport - position.xy *= viewport_flip.xy; + // Check if the flip stage is VertexB + if (flip_stage[0] == 1) { + // Viewport can be flipped, which is unsupported by glViewport + position.xy *= viewport_flip.xy; + } gl_Position = position; // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0 // For now, this is here to bring order in lieu of proper emulation - position.w = 1.0; + if (flip_stage[0] == 1) { + position.w = 1.0; + } } )"; - out += program.first; + return {out, program.second}; +} - if (setup.IsDualProgram()) { - ProgramResult program_b = - Decompiler::DecompileProgram(setup.program.code_b, PROGRAM_OFFSET, - Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b") - .get_value_or({}); - out += program_b.first; - } +ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { + std::string out = "#version 430 core\n"; + out += "#extension GL_ARB_separate_shader_objects : enable\n\n"; + out += Decompiler::GetCommonDeclarations(); + out += "bool exec_geometry();\n"; + + ProgramResult program = + Decompiler::DecompileProgram(setup.program.code, PROGRAM_OFFSET, + Maxwell3D::Regs::ShaderStage::Geometry, "geometry") + .get_value_or({}); + out += R"( +out gl_PerVertex { + vec4 gl_Position; +}; +layout (std140) uniform gs_config { + vec4 viewport_flip; + uvec4 instance_id; + uvec4 flip_stage; +}; + +void main() { + exec_geometry(); +} + +)"; + out += program.first; return {out, program.second}; } @@ -87,7 +122,6 @@ ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { Maxwell3D::Regs::ShaderStage::Fragment, "fragment") .get_value_or({}); out += R"( -in vec4 position; layout(location = 0) out vec4 FragColor0; layout(location = 1) out vec4 FragColor1; layout(location = 2) out vec4 FragColor2; @@ -100,6 +134,7 @@ layout(location = 7) out vec4 FragColor7; layout (std140) uniform fs_config { vec4 viewport_flip; uvec4 instance_id; + uvec4 flip_stage; }; void main() { @@ -110,5 +145,4 @@ void main() { out += program.first; return {out, program.second}; } - -} // namespace OpenGL::GLShader +} // namespace OpenGL::GLShader
\ No newline at end of file diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index e56f39e78..79596087a 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -196,6 +196,12 @@ private: ProgramResult GenerateVertexShader(const ShaderSetup& setup); /** + * Generates the GLSL geometry shader program source code for the given GS program + * @returns String of the shader source code + */ +ProgramResult GenerateGeometryShader(const ShaderSetup& setup); + +/** * Generates the GLSL fragment shader program source code for the given FS program * @returns String of the shader source code */ diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 022d32a86..010857ec6 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -18,6 +18,14 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh // We only assign the instance to the first component of the vector, the rest is just padding. instance_id[0] = state.current_instance; + + // Assign in which stage the position has to be flipped + // (the last stage before the fragment shader). + if (gpu.regs.shader_config[static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry)].enable) { + flip_stage[0] = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry); + } else { + flip_stage[0] = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB); + } } } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 3de15ba9b..b3a191cf2 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -21,8 +21,9 @@ struct MaxwellUniformData { void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage); alignas(16) GLvec4 viewport_flip; alignas(16) GLuvec4 instance_id; + alignas(16) GLuvec4 flip_stage; }; -static_assert(sizeof(MaxwellUniformData) == 32, "MaxwellUniformData structure size is incorrect"); +static_assert(sizeof(MaxwellUniformData) == 48, "MaxwellUniformData structure size is incorrect"); static_assert(sizeof(MaxwellUniformData) < 16384, "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec"); @@ -36,6 +37,10 @@ public: vs = program; } + void UseProgrammableGeometryShader(GLuint program) { + gs = program; + } + void UseProgrammableFragmentShader(GLuint program) { fs = program; } |
