Diffstat (limited to 'src/video_core')
64 files changed, 5318 insertions, 3409 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 09ecc5bad..0406fbcd9 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,6 +1,6 @@ add_library(video_core STATIC - command_processor.cpp - command_processor.h + dma_pusher.cpp + dma_pusher.h debug_utils/debug_utils.cpp debug_utils/debug_utils.h engines/fermi_2d.cpp @@ -21,6 +21,9 @@ add_library(video_core STATIC macro_interpreter.h memory_manager.cpp memory_manager.h + morton.cpp + morton.h + rasterizer_cache.cpp rasterizer_cache.h rasterizer_interface.h renderer_base.cpp @@ -33,6 +36,7 @@ add_library(video_core STATIC renderer_opengl/gl_rasterizer.h renderer_opengl/gl_rasterizer_cache.cpp renderer_opengl/gl_rasterizer_cache.h + renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_resource_manager.h renderer_opengl/gl_shader_cache.cpp renderer_opengl/gl_shader_cache.h @@ -51,12 +55,15 @@ add_library(video_core STATIC renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h + renderer_opengl/utils.cpp + renderer_opengl/utils.h + surface.cpp + surface.h textures/astc.cpp textures/astc.h textures/decoders.cpp textures/decoders.h textures/texture.h - utils.h video_core.cpp video_core.h ) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp deleted file mode 100644 index f1aa6091b..000000000 --- a/src/video_core/command_processor.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <array> -#include <cstddef> -#include <memory> -#include <utility> -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "common/vector_math.h" -#include "core/memory.h" -#include "core/tracer/recorder.h" -#include "video_core/command_processor.h" -#include "video_core/engines/fermi_2d.h" -#include "video_core/engines/kepler_memory.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/engines/maxwell_compute.h" -#include "video_core/engines/maxwell_dma.h" -#include "video_core/gpu.h" -#include "video_core/renderer_base.h" -#include "video_core/video_core.h" - -namespace Tegra { - -enum class BufferMethods { - BindObject = 0, - CountBufferMethods = 0x40, -}; - -MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); - -void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) { - MICROPROFILE_SCOPE(ProcessCommandLists); - - auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) { - LOG_TRACE(HW_GPU, - "Processing method {:08X} on subchannel {} value " - "{:08X} remaining params {}", - method, subchannel, value, remaining_params); - - ASSERT(subchannel < bound_engines.size()); - - if (method == static_cast<u32>(BufferMethods::BindObject)) { - // Bind the current subchannel to the desired engine id. - LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); - bound_engines[subchannel] = static_cast<EngineID>(value); - return; - } - - if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) { - // TODO(Subv): Research and implement these methods. 
- LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); - return; - } - - const EngineID engine = bound_engines[subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->WriteReg(method, value); - break; - case EngineID::MAXWELL_B: - maxwell_3d->WriteReg(method, value, remaining_params); - break; - case EngineID::MAXWELL_COMPUTE_B: - maxwell_compute->WriteReg(method, value); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->WriteReg(method, value); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->WriteReg(method, value); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } - }; - - for (auto entry : commands) { - Tegra::GPUVAddr address = entry.Address(); - u32 size = entry.sz; - const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address); - VAddr current_addr = *head_address; - while (current_addr < *head_address + size * sizeof(CommandHeader)) { - const CommandHeader header = {Memory::Read32(current_addr)}; - current_addr += sizeof(u32); - - switch (header.mode.Value()) { - case SubmissionMode::IncreasingOld: - case SubmissionMode::Increasing: { - // Increase the method value with each argument. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; - } - case SubmissionMode::NonIncreasingOld: - case SubmissionMode::NonIncreasing: { - // Use the same method value for all arguments. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; - } - case SubmissionMode::IncreaseOnce: { - ASSERT(header.arg_count.Value() >= 1); - - // Use the original method for the first argument and then the next method for all - // other arguments. - WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - 1); - current_addr += sizeof(u32); - - for (unsigned i = 1; i < header.arg_count; ++i) { - WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; - } - case SubmissionMode::Inline: { - // The register value is stored in the bits 16-28 as an immediate - WriteReg(header.method, header.subchannel, header.inline_data, 0); - break; - } - default: - UNIMPLEMENTED(); - } - } - } -} - -} // namespace Tegra diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h deleted file mode 100644 index bd766e77a..000000000 --- a/src/video_core/command_processor.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <type_traits> -#include "common/bit_field.h" -#include "common/common_types.h" -#include "video_core/memory_manager.h" - -namespace Tegra { - -enum class SubmissionMode : u32 { - IncreasingOld = 0, - Increasing = 1, - NonIncreasingOld = 2, - NonIncreasing = 3, - Inline = 4, - IncreaseOnce = 5 -}; - -struct CommandListHeader { - u32 entry0; // gpu_va_lo - union { - u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) - BitField<0, 8, u32> gpu_va_hi; - BitField<8, 2, u32> unk1; - BitField<10, 21, u32> sz; - BitField<31, 1, u32> unk2; - }; - - GPUVAddr Address() const { - return (static_cast<GPUVAddr>(gpu_va_hi) << 32) | entry0; - } -}; -static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size"); - -union CommandHeader { - u32 hex; - - BitField<0, 13, u32> method; - BitField<13, 3, u32> subchannel; - - BitField<16, 13, u32> arg_count; - BitField<16, 13, u32> inline_data; - - BitField<29, 3, SubmissionMode> mode; -}; -static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); -static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); - -} // namespace Tegra diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp new file mode 100644 index 000000000..63a958f11 --- /dev/null +++ b/src/video_core/dma_pusher.cpp @@ -0,0 +1,123 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/microprofile.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/dma_pusher.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" + +namespace Tegra { + +DmaPusher::DmaPusher(GPU& gpu) : gpu(gpu) {} + +DmaPusher::~DmaPusher() = default; + +MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); + +void DmaPusher::DispatchCalls() { + MICROPROFILE_SCOPE(DispatchCalls); + + // On entering GPU code, assume all memory may be touched by the ARM core. 
+ gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + + dma_pushbuffer_subindex = 0; + + while (Core::System::GetInstance().IsPoweredOn()) { + if (!Step()) { + break; + } + } +} + +bool DmaPusher::Step() { + if (dma_get != dma_put) { + // Push buffer non-empty, read a word + const CommandHeader command_header{ + Memory::Read32(*gpu.MemoryManager().GpuToCpuAddress(dma_get))}; + + dma_get += sizeof(u32); + + if (!non_main) { + dma_mget = dma_get; + } + + // now, see if we're in the middle of a command + if (dma_state.length_pending) { + // Second word of long non-inc methods command - method count + dma_state.length_pending = 0; + dma_state.method_count = command_header.method_count_; + } else if (dma_state.method_count) { + // Data word of methods command + CallMethod(command_header.argument); + + if (!dma_state.non_incrementing) { + dma_state.method++; + } + + if (dma_increment_once) { + dma_state.non_incrementing = true; + } + + dma_state.method_count--; + } else { + // No command active - this is the first word of a new one + switch (command_header.mode) { + case SubmissionMode::Increasing: + SetState(command_header); + dma_state.non_incrementing = false; + dma_increment_once = false; + break; + case SubmissionMode::NonIncreasing: + SetState(command_header); + dma_state.non_incrementing = true; + dma_increment_once = false; + break; + case SubmissionMode::Inline: + dma_state.method = command_header.method; + dma_state.subchannel = command_header.subchannel; + CallMethod(command_header.arg_count); + dma_state.non_incrementing = true; + dma_increment_once = false; + break; + case SubmissionMode::IncreaseOnce: + SetState(command_header); + dma_state.non_incrementing = false; + dma_increment_once = true; + break; + } + } + } else if (ib_enable && !dma_pushbuffer.empty()) { + // Current pushbuffer empty, but we have more IB entries to read + const CommandList& command_list{dma_pushbuffer.front()}; + const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]}; + dma_get = command_list_header.addr; + dma_put = dma_get + command_list_header.size * sizeof(u32); + non_main = command_list_header.is_non_main; + + if (dma_pushbuffer_subindex >= command_list.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } + } else { + // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do + return {}; + } + + return true; +} + +void DmaPusher::SetState(const CommandHeader& command_header) { + dma_state.method = command_header.method; + dma_state.subchannel = command_header.subchannel; + dma_state.method_count = command_header.method_count; +} + +void DmaPusher::CallMethod(u32 argument) const { + gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); +} + +} // namespace Tegra diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h new file mode 100644 index 000000000..16e0697c4 --- /dev/null +++ b/src/video_core/dma_pusher.h @@ -0,0 +1,99 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/memory_manager.h" + +namespace Tegra { + +enum class SubmissionMode : u32 { + IncreasingOld = 0, + Increasing = 1, + NonIncreasingOld = 2, + NonIncreasing = 3, + Inline = 4, + IncreaseOnce = 5 +}; + +struct CommandListHeader { + union { + u64 raw; + BitField<0, 40, GPUVAddr> addr; + BitField<41, 1, u64> is_non_main; + BitField<42, 21, u64> size; + }; +}; +static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size"); + +union CommandHeader { + u32 argument; + BitField<0, 13, u32> method; + BitField<0, 24, u32> method_count_; + BitField<13, 3, u32> subchannel; + BitField<16, 13, u32> arg_count; + BitField<16, 13, u32> method_count; + BitField<29, 3, SubmissionMode> mode; +}; +static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); +static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); + +class GPU; + +using CommandList = std::vector<Tegra::CommandListHeader>; + +/** + * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the + * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled + * into a "command stream" consisting of 32-bit words that make up "commands". + * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for + * details on this implementation. + */ +class DmaPusher { +public: + explicit DmaPusher(GPU& gpu); + ~DmaPusher(); + + void Push(CommandList&& entries) { + dma_pushbuffer.push(std::move(entries)); + } + + void DispatchCalls(); + +private: + bool Step(); + + void SetState(const CommandHeader& command_header); + + void CallMethod(u32 argument) const; + + GPU& gpu; + + std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed + std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer + + struct DmaState { + u32 method; ///< Current method + u32 subchannel; ///< Current subchannel + u32 method_count; ///< Current method count + u32 length_pending; ///< Large NI command length pending + bool non_incrementing; ///< Current command’s NI flag + }; + + DmaState dma_state{}; + bool dma_increment_once{}; + + GPUVAddr dma_put{}; ///< pushbuffer current end address + GPUVAddr dma_get{}; ///< pushbuffer current read address + GPUVAddr dma_mget{}; ///< main pushbuffer last read address + bool ib_enable{true}; ///< IB mode enabled + bool non_main{}; ///< non-main pushbuffer active +}; + +} // namespace Tegra diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 74e44c7fe..80f70e332 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -2,8 +2,10 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
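The DmaPusher introduced above replaces GPU::ProcessCommandLists: Push() queues whole command lists, and DispatchCalls()/Step() then consume the pushbuffer one 32-bit word at a time, decoding each header into a submission mode, method, subchannel and method count before forwarding the argument words to GPU::CallMethod. What follows is an illustrative, self-contained sketch of that header decoding using plain shifts and masks instead of the BitField union declared in dma_pusher.h; the field positions match that declaration, but the example word, names and output are made up for demonstration only.

#include <cstdint>
#include <cstdio>

// Mirrors the CommandHeader layout: method [0:12], subchannel [13:15],
// method_count/arg_count [16:28], submission mode [29:31].
struct DecodedHeader {
    uint32_t method;
    uint32_t subchannel;
    uint32_t method_count;
    uint32_t mode;
};

DecodedHeader DecodeHeader(uint32_t word) {
    return {
        word & 0x1FFFu,          // method
        (word >> 13) & 0x7u,     // subchannel
        (word >> 16) & 0x1FFFu,  // method_count / arg_count
        (word >> 29) & 0x7u,     // SubmissionMode
    };
}

int main() {
    // Hypothetical header word: mode 1 (Increasing), 4 arguments,
    // subchannel 0, starting method 0x140.
    const DecodedHeader h = DecodeHeader(0x20040140u);
    std::printf("mode=%u method=0x%X subchannel=%u count=%u\n", h.mode, h.method,
                h.subchannel, h.method_count);
    // In Increasing mode the four following words would be written to methods
    // 0x140..0x143; in NonIncreasing mode all four would target 0x140.
    return 0;
}
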
+#include "core/core.h" #include "core/memory.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" #include "video_core/textures/decoders.h" @@ -12,13 +14,13 @@ namespace Tegra::Engines { Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : memory_manager(memory_manager), rasterizer{rasterizer} {} -void Fermi2D::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case FERMI2D_REG_INDEX(trigger): { HandleSurfaceCopy(); break; @@ -47,6 +49,9 @@ void Fermi2D::HandleSurfaceCopy() { u32 dst_bytes_per_pixel = RenderTargetBytesPerPixel(regs.dst.format); if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst)) { + // All copies here update the main memory, so mark all rasterizer states as invalid. + Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + rasterizer.FlushRegion(source_cpu, src_bytes_per_pixel * regs.src.width * regs.src.height); // We have to invalidate the destination region to evict any outdated surfaces from the // cache. We do this before actually writing the new data because the destination address @@ -68,13 +73,13 @@ void Fermi2D::HandleSurfaceCopy() { Texture::CopySwizzledData(regs.src.width, regs.src.height, regs.src.depth, src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer, dst_buffer, true, regs.src.BlockHeight(), - regs.src.BlockDepth()); + regs.src.BlockDepth(), 0); } else { // If the input is linear and the output is tiled, swizzle the input and copy it over. Texture::CopySwizzledData(regs.src.width, regs.src.height, regs.src.depth, src_bytes_per_pixel, dst_bytes_per_pixel, dst_buffer, src_buffer, false, regs.dst.BlockHeight(), - regs.dst.BlockDepth()); + regs.dst.BlockDepth(), 0); } } } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 2a6e8bbbb..50009bf75 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -27,7 +27,7 @@ public: ~Fermi2D() = default; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr std::size_t NUM_REGS = 0x258; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 585290d9f..4880191fc 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -3,8 +3,10 @@ // Refer to the license.txt file included. 
#include "common/logging/log.h" +#include "core/core.h" #include "core/memory.h" #include "video_core/engines/kepler_memory.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" namespace Tegra::Engines { @@ -15,19 +17,19 @@ KeplerMemory::KeplerMemory(VideoCore::RasterizerInterface& rasterizer, KeplerMemory::~KeplerMemory() = default; -void KeplerMemory::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid KeplerMemory register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { state.write_offset = 0; break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(value); + ProcessData(method_call.argument); break; } } @@ -47,6 +49,7 @@ void KeplerMemory::ProcessData(u32 data) { rasterizer.InvalidateRegion(dest_address, sizeof(u32)); Memory::Write32(dest_address, data); + Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); state.write_offset++; } diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index bf4a13cff..fe9ebc5b9 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -9,6 +9,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" namespace VideoCore { @@ -26,7 +27,7 @@ public: ~KeplerMemory(); /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr size_t NUM_REGS = 0x7F; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 27ef865a2..b19b3a75a 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -34,8 +34,49 @@ void Maxwell3D::InitializeRegisterDefaults() { // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is // needed for ARMS. 
for (std::size_t viewport{}; viewport < Regs::NumViewports; ++viewport) { - regs.viewport[viewport].depth_range_near = 0.0f; - regs.viewport[viewport].depth_range_far = 1.0f; + regs.viewports[viewport].depth_range_near = 0.0f; + regs.viewports[viewport].depth_range_far = 1.0f; + } + // Doom and Bomberman seems to use the uninitialized registers and just enable blend + // so initialize blend registers with sane values + regs.blend.equation_rgb = Regs::Blend::Equation::Add; + regs.blend.factor_source_rgb = Regs::Blend::Factor::One; + regs.blend.factor_dest_rgb = Regs::Blend::Factor::Zero; + regs.blend.equation_a = Regs::Blend::Equation::Add; + regs.blend.factor_source_a = Regs::Blend::Factor::One; + regs.blend.factor_dest_a = Regs::Blend::Factor::Zero; + for (std::size_t blend_index = 0; blend_index < Regs::NumRenderTargets; blend_index++) { + regs.independent_blend[blend_index].equation_rgb = Regs::Blend::Equation::Add; + regs.independent_blend[blend_index].factor_source_rgb = Regs::Blend::Factor::One; + regs.independent_blend[blend_index].factor_dest_rgb = Regs::Blend::Factor::Zero; + regs.independent_blend[blend_index].equation_a = Regs::Blend::Equation::Add; + regs.independent_blend[blend_index].factor_source_a = Regs::Blend::Factor::One; + regs.independent_blend[blend_index].factor_dest_a = Regs::Blend::Factor::Zero; + } + regs.stencil_front_op_fail = Regs::StencilOp::Keep; + regs.stencil_front_op_zfail = Regs::StencilOp::Keep; + regs.stencil_front_op_zpass = Regs::StencilOp::Keep; + regs.stencil_front_func_func = Regs::ComparisonOp::Always; + regs.stencil_front_func_mask = 0xFFFFFFFF; + regs.stencil_front_mask = 0xFFFFFFFF; + regs.stencil_two_side_enable = 1; + regs.stencil_back_op_fail = Regs::StencilOp::Keep; + regs.stencil_back_op_zfail = Regs::StencilOp::Keep; + regs.stencil_back_op_zpass = Regs::StencilOp::Keep; + regs.stencil_back_func_func = Regs::ComparisonOp::Always; + regs.stencil_back_func_mask = 0xFFFFFFFF; + regs.stencil_back_mask = 0xFFFFFFFF; + // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a + // register carrying a default value. Assume it's OpenGL's default (1). + regs.point_size = 1.0f; + + // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a + // default of enabled fixes rendering here. + for (std::size_t color_mask = 0; color_mask < Regs::NumRenderTargets; color_mask++) { + regs.color_mask[color_mask].R.Assign(1); + regs.color_mask[color_mask].G.Assign(1); + regs.color_mask[color_mask].B.Assign(1); + regs.color_mask[color_mask].A.Assign(1); } } @@ -43,58 +84,87 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) { // Reset the current macro. executing_macro = 0; - // The requested macro must have been uploaded already. - auto macro_code = uploaded_macros.find(method); - if (macro_code == uploaded_macros.end()) { - LOG_ERROR(HW_GPU, "Macro {:04X} was not uploaded", method); + // Lookup the macro offset + const u32 entry{(method - MacroRegistersStart) >> 1}; + const auto& search{macro_offsets.find(entry)}; + if (search == macro_offsets.end()) { + LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method); + UNREACHABLE(); return; } // Execute the current macro. 
- macro_interpreter.Execute(macro_code->second, std::move(parameters)); + macro_interpreter.Execute(search->second, std::move(parameters)); } -void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { +void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { auto debug_context = Core::System::GetInstance().GetGPUDebugContext(); // It is an error to write to a register other than the current macro's ARG register before it // has finished execution. if (executing_macro != 0) { - ASSERT(method == executing_macro + 1); + ASSERT(method_call.method == executing_macro + 1); } // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. - if (method >= MacroRegistersStart) { + if (method_call.method >= MacroRegistersStart) { // We're trying to execute a macro if (executing_macro == 0) { // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, + ASSERT_MSG((method_call.method % 2) == 0, "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; + executing_macro = method_call.method; } - macro_params.push_back(value); + macro_params.push_back(method_call.argument); // Call the macro when there are no more parameters in the command buffer - if (remaining_params == 0) { + if (method_call.IsLastCall()) { CallMacroMethod(executing_macro, std::move(macro_params)); } return; } - ASSERT_MSG(method < Regs::NUM_REGS, + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); if (debug_context) { debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandLoaded, nullptr); } - regs.reg_array[method] = value; + if (regs.reg_array[method_call.method] != method_call.argument) { + regs.reg_array[method_call.method] = method_call.argument; + // Vertex format + if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && + method_call.method < + MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { + dirty_flags.vertex_attrib_format = true; + } + + // Vertex buffer + if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array) && + method_call.method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) { + dirty_flags.vertex_array |= + 1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); + } else if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && + method_call.method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) { + dirty_flags.vertex_array |= + 1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); + } else if (method_call.method >= MAXWELL3D_REG_INDEX(instanced_arrays) && + method_call.method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) { + dirty_flags.vertex_array |= + 1u << (method_call.method - MAXWELL3D_REG_INDEX(instanced_arrays)); + } + } - switch (method) { + switch (method_call.method) { case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(value); + ProcessMacroUpload(method_call.argument); + break; + } + case MAXWELL3D_REG_INDEX(macros.bind): { + ProcessMacroBind(method_call.argument); break; } case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): @@ -113,7 +183,7 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - ProcessCBData(value); + 
ProcessCBData(method_call.argument); break; } case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): { @@ -158,16 +228,20 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { } void Maxwell3D::ProcessMacroUpload(u32 data) { - // Store the uploaded macro code to interpret them when they're called. - auto& macro = uploaded_macros[regs.macros.entry * 2 + MacroRegistersStart]; - macro.push_back(data); + ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), + "upload_address exceeded macro_memory size!"); + macro_memory[regs.macros.upload_address++] = data; +} + +void Maxwell3D::ProcessMacroBind(u32 data) { + macro_offsets[regs.macros.entry] = data; } void Maxwell3D::ProcessQueryGet() { GPUVAddr sequence_address = regs.query.QueryAddress(); // Since the sequence address is given as a GPU VAddr, we have to convert it to an application // VAddr before writing. - boost::optional<VAddr> address = memory_manager.GpuToCpuAddress(sequence_address); + std::optional<VAddr> address = memory_manager.GpuToCpuAddress(sequence_address); // TODO(Subv): Support the other query units. ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, @@ -213,6 +287,7 @@ void Maxwell3D::ProcessQueryGet() { query_result.timestamp = CoreTiming::GetTicks(); Memory::WriteBlock(*address, &query_result, sizeof(query_result)); } + dirty_flags.OnMemoryWrite(); break; } default: @@ -285,10 +360,11 @@ void Maxwell3D::ProcessCBData(u32 value) { // Don't allow writing past the end of the buffer. ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size); - boost::optional<VAddr> address = + std::optional<VAddr> address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos); Memory::Write32(*address, value); + dirty_flags.OnMemoryWrite(); // Increment the current buffer position. 
regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; @@ -298,7 +374,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { GPUVAddr tic_base_address = regs.tic.TICAddress(); GPUVAddr tic_address_gpu = tic_base_address + tic_index * sizeof(Texture::TICEntry); - boost::optional<VAddr> tic_address_cpu = memory_manager.GpuToCpuAddress(tic_address_gpu); + std::optional<VAddr> tic_address_cpu = memory_manager.GpuToCpuAddress(tic_address_gpu); Texture::TICEntry tic_entry; Memory::ReadBlock(*tic_address_cpu, &tic_entry, sizeof(Texture::TICEntry)); @@ -322,7 +398,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { GPUVAddr tsc_base_address = regs.tsc.TSCAddress(); GPUVAddr tsc_address_gpu = tsc_base_address + tsc_index * sizeof(Texture::TSCEntry); - boost::optional<VAddr> tsc_address_cpu = memory_manager.GpuToCpuAddress(tsc_address_gpu); + std::optional<VAddr> tsc_address_cpu = memory_manager.GpuToCpuAddress(tsc_address_gpu); Texture::TSCEntry tsc_entry; Memory::ReadBlock(*tsc_address_cpu, &tsc_entry, sizeof(Texture::TSCEntry)); @@ -386,7 +462,7 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); - boost::optional<VAddr> tex_address_cpu = memory_manager.GpuToCpuAddress(tex_info_address); + std::optional<VAddr> tex_address_cpu = memory_manager.GpuToCpuAddress(tex_info_address); Texture::TextureHandle tex_handle{Memory::Read32(*tex_address_cpu)}; Texture::FullTextureInfo tex_info{}; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 443affc36..0faff6fdf 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -42,6 +42,7 @@ public: static constexpr std::size_t NumVertexArrays = 32; static constexpr std::size_t NumVertexAttributes = 32; static constexpr std::size_t NumTextureSamplers = 32; + static constexpr std::size_t NumClipDistances = 8; static constexpr std::size_t MaxShaderProgram = 6; static constexpr std::size_t MaxShaderStage = 5; // Maximum number of const buffers per shader stage. @@ -163,6 +164,7 @@ public: return 3; default: UNREACHABLE(); + return 1; } } @@ -345,6 +347,14 @@ public: Invert = 6, IncrWrap = 7, DecrWrap = 8, + KeepOGL = 0x1E00, + ZeroOGL = 0, + ReplaceOGL = 0x1E01, + IncrOGL = 0x1E02, + DecrOGL = 0x1E03, + InvertOGL = 0x150A, + IncrWrapOGL = 0x8507, + DecrWrapOGL = 0x8508, }; enum class MemoryLayout : u32 { @@ -381,6 +391,13 @@ public: ReverseSubtract = 3, Min = 4, Max = 5, + + // These values are used by Nouveau and some games. 
+ AddGL = 0x8006, + SubtractGL = 0x8007, + ReverseSubtractGL = 0x8008, + MinGL = 0x800a, + MaxGL = 0x800b }; enum class Factor : u32 { @@ -462,6 +479,77 @@ public: } }; + struct ColorMask { + union { + u32 raw; + BitField<0, 4, u32> R; + BitField<4, 4, u32> G; + BitField<8, 4, u32> B; + BitField<12, 4, u32> A; + }; + }; + + struct ViewportTransform { + f32 scale_x; + f32 scale_y; + f32 scale_z; + f32 translate_x; + f32 translate_y; + f32 translate_z; + INSERT_PADDING_WORDS(2); + + MathUtil::Rectangle<s32> GetRect() const { + return { + GetX(), // left + GetY() + GetHeight(), // top + GetX() + GetWidth(), // right + GetY() // bottom + }; + }; + + s32 GetX() const { + return static_cast<s32>(std::max(0.0f, translate_x - std::fabs(scale_x))); + } + + s32 GetY() const { + return static_cast<s32>(std::max(0.0f, translate_y - std::fabs(scale_y))); + } + + s32 GetWidth() const { + return static_cast<s32>(translate_x + std::fabs(scale_x)) - GetX(); + } + + s32 GetHeight() const { + return static_cast<s32>(translate_y + std::fabs(scale_y)) - GetY(); + } + }; + + struct ScissorTest { + u32 enable; + union { + BitField<0, 16, u32> min_x; + BitField<16, 16, u32> max_x; + }; + union { + BitField<0, 16, u32> min_y; + BitField<16, 16, u32> max_y; + }; + u32 fill; + }; + + struct ViewPort { + union { + BitField<0, 16, u32> x; + BitField<16, 16, u32> width; + }; + union { + BitField<0, 16, u32> y; + BitField<16, 16, u32> height; + }; + float depth_range_near; + float depth_range_far; + }; + bool IsShaderConfigEnabled(std::size_t index) const { // The VertexB is always enabled. if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) { @@ -475,66 +563,23 @@ public: INSERT_PADDING_WORDS(0x45); struct { - INSERT_PADDING_WORDS(1); + u32 upload_address; u32 data; u32 entry; + u32 bind; } macros; - INSERT_PADDING_WORDS(0x189); + INSERT_PADDING_WORDS(0x188); u32 tfb_enabled; INSERT_PADDING_WORDS(0x2E); - RenderTargetConfig rt[NumRenderTargets]; - - struct { - f32 scale_x; - f32 scale_y; - f32 scale_z; - f32 translate_x; - f32 translate_y; - f32 translate_z; - INSERT_PADDING_WORDS(2); - - MathUtil::Rectangle<s32> GetRect() const { - return { - GetX(), // left - GetY() + GetHeight(), // top - GetX() + GetWidth(), // right - GetY() // bottom - }; - }; - - s32 GetX() const { - return static_cast<s32>(std::max(0.0f, translate_x - std::fabs(scale_x))); - } - - s32 GetY() const { - return static_cast<s32>(std::max(0.0f, translate_y - std::fabs(scale_y))); - } - - s32 GetWidth() const { - return static_cast<s32>(translate_x + std::fabs(scale_x)) - GetX(); - } + std::array<RenderTargetConfig, NumRenderTargets> rt; - s32 GetHeight() const { - return static_cast<s32>(translate_y + std::fabs(scale_y)) - GetY(); - } - } viewport_transform[NumViewports]; + std::array<ViewportTransform, NumViewports> viewport_transform; - struct { - union { - BitField<0, 16, u32> x; - BitField<16, 16, u32> width; - }; - union { - BitField<0, 16, u32> y; - BitField<16, 16, u32> height; - }; - float depth_range_near; - float depth_range_far; - } viewport[NumViewports]; + std::array<ViewPort, NumViewports> viewports; INSERT_PADDING_WORDS(0x1D); @@ -547,30 +592,32 @@ public: float clear_color[4]; float clear_depth; + INSERT_PADDING_WORDS(0x3); + s32 clear_stencil; - INSERT_PADDING_WORDS(0x17); + INSERT_PADDING_WORDS(0x7); - struct { - u32 enable; - union { - BitField<0, 16, u32> min_x; - BitField<16, 16, u32> max_x; - }; - union { - BitField<0, 16, u32> min_y; - BitField<16, 16, u32> max_y; - }; - } scissor_test; + u32 
polygon_offset_point_enable; + u32 polygon_offset_line_enable; + u32 polygon_offset_fill_enable; + + INSERT_PADDING_WORDS(0xD); + + std::array<ScissorTest, NumViewports> scissor_test; - INSERT_PADDING_WORDS(0x52); + INSERT_PADDING_WORDS(0x15); s32 stencil_back_func_ref; u32 stencil_back_mask; u32 stencil_back_func_mask; - INSERT_PADDING_WORDS(0x13); + INSERT_PADDING_WORDS(0xC); + + u32 color_mask_common; + + INSERT_PADDING_WORDS(0x6); u32 rt_separate_frag_data; @@ -594,7 +641,16 @@ public: } } zeta; - INSERT_PADDING_WORDS(0x5B); + INSERT_PADDING_WORDS(0x41); + + union { + BitField<0, 4, u32> stencil; + BitField<4, 4, u32> unknown; + BitField<8, 4, u32> scissor; + BitField<12, 4, u32> viewport; + } clear_flags; + + INSERT_PADDING_WORDS(0x19); std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format; @@ -645,8 +701,14 @@ public: ComparisonOp depth_test_func; float alpha_test_ref; ComparisonOp alpha_test_func; - - INSERT_PADDING_WORDS(0x9); + u32 draw_tfb_stride; + struct { + float r; + float g; + float b; + float a; + } blend_color; + INSERT_PADDING_WORDS(0x4); struct { u32 separate_alpha; @@ -671,9 +733,12 @@ public: u32 stencil_front_func_mask; u32 stencil_front_mask; - INSERT_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS(0x2); + + u32 frag_color_clamp; union { + BitField<0, 1, u32> y_negate; BitField<4, 1, u32> triangle_rast_flip; } screen_y_control; @@ -681,7 +746,20 @@ public: u32 vb_element_base; - INSERT_PADDING_WORDS(0x38); + INSERT_PADDING_WORDS(0x36); + + union { + BitField<0, 1, u32> c0; + BitField<1, 1, u32> c1; + BitField<2, 1, u32> c2; + BitField<3, 1, u32> c3; + BitField<4, 1, u32> c4; + BitField<5, 1, u32> c5; + BitField<6, 1, u32> c6; + BitField<7, 1, u32> c7; + } clip_distance_enabled; + + INSERT_PADDING_WORDS(0x1); float point_size; @@ -689,7 +767,12 @@ public: u32 zeta_enable; - INSERT_PADDING_WORDS(0x8); + union { + BitField<0, 1, u32> alpha_to_coverage; + BitField<4, 1, u32> alpha_to_one; + } multisample_control; + + INSERT_PADDING_WORDS(0x7); struct { u32 tsc_address_high; @@ -702,7 +785,11 @@ public: } } tsc; - INSERT_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS(0x1); + + float polygon_offset_factor; + + INSERT_PADDING_WORDS(0x1); struct { u32 tic_address_high; @@ -727,7 +814,9 @@ public: u32 framebuffer_srgb; - INSERT_PADDING_WORDS(0x12); + float polygon_offset_units; + + INSERT_PADDING_WORDS(0x11); union { BitField<2, 1, u32> coord_origin; @@ -783,6 +872,7 @@ public: return 4; } UNREACHABLE(); + return 1; } GPUVAddr StartAddress() const { @@ -804,7 +894,9 @@ public: INSERT_PADDING_WORDS(0x7); - INSERT_PADDING_WORDS(0x20); + INSERT_PADDING_WORDS(0x1F); + + float polygon_offset_clamp; struct { u32 is_instanced[NumVertexArrays]; @@ -820,8 +912,21 @@ public: Cull cull; - INSERT_PADDING_WORDS(0x28); + u32 pixel_center_integer; + + INSERT_PADDING_WORDS(0x1); + + u32 viewport_transform_enabled; + + INSERT_PADDING_WORDS(0x3); + + union { + BitField<0, 1, u32> depth_range_0_1; + BitField<3, 1, u32> depth_clamp_near; + BitField<4, 1, u32> depth_clamp_far; + } view_volume_clip_control; + INSERT_PADDING_WORDS(0x21); struct { u32 enable; LogicOperation operation; @@ -840,8 +945,9 @@ public: BitField<6, 4, u32> RT; BitField<10, 11, u32> layer; } clear_buffers; - - INSERT_PADDING_WORDS(0x4B); + INSERT_PADDING_WORDS(0xB); + std::array<ColorMask, NumRenderTargets> color_mask; + INSERT_PADDING_WORDS(0x38); struct { u32 query_address_high; @@ -982,11 +1088,22 @@ public: State state{}; MemoryManager& memory_manager; + struct DirtyFlags { + bool vertex_attrib_format = true; + 
u32 vertex_array = 0xFFFFFFFF; + + void OnMemoryWrite() { + vertex_array = 0xFFFFFFFF; + } + }; + + DirtyFlags dirty_flags; + /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value, u32 remaining_params); + void CallMethod(const GPU::MethodCall& method_call); /// Returns a list of enabled textures for the specified shader stage. std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const; @@ -994,12 +1111,25 @@ public: /// Returns the texture information for a specific texture in a specific shader stage. Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const; + /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than + /// we've seen used. + using MacroMemory = std::array<u32, 0x40000>; + + /// Gets a reference to macro memory. + const MacroMemory& GetMacroMemory() const { + return macro_memory; + } + private: void InitializeRegisterDefaults(); VideoCore::RasterizerInterface& rasterizer; - std::unordered_map<u32, std::vector<u32>> uploaded_macros; + /// Start offsets of each macro in macro_memory + std::unordered_map<u32, u32> macro_offsets; + + /// Memory for macro code + MacroMemory macro_memory; /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; @@ -1022,9 +1152,12 @@ private: */ void CallMacroMethod(u32 method, std::vector<u32> parameters); - /// Handles writes to the macro uploading registers. + /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); + /// Handles writes to the macro bind register. + void ProcessMacroBind(u32 data); + /// Handles a write to the CLEAR_BUFFERS register. 
void ProcessClearBuffers(); @@ -1048,18 +1181,23 @@ private: ASSERT_REG_POSITION(macros, 0x45); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); -ASSERT_REG_POSITION(viewport_transform[0], 0x280); -ASSERT_REG_POSITION(viewport, 0x300); +ASSERT_REG_POSITION(viewport_transform, 0x280); +ASSERT_REG_POSITION(viewports, 0x300); ASSERT_REG_POSITION(vertex_buffer, 0x35D); ASSERT_REG_POSITION(clear_color[0], 0x360); ASSERT_REG_POSITION(clear_depth, 0x364); ASSERT_REG_POSITION(clear_stencil, 0x368); +ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370); +ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371); +ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); +ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(zeta, 0x3F8); +ASSERT_REG_POSITION(clear_flags, 0x43E); ASSERT_REG_POSITION(vertex_attrib_format, 0x458); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); @@ -1070,6 +1208,10 @@ ASSERT_REG_POSITION(depth_write_enabled, 0x4BA); ASSERT_REG_POSITION(alpha_test_enabled, 0x4BB); ASSERT_REG_POSITION(d3d_cull_mode, 0x4C2); ASSERT_REG_POSITION(depth_test_func, 0x4C3); +ASSERT_REG_POSITION(alpha_test_ref, 0x4C4); +ASSERT_REG_POSITION(alpha_test_func, 0x4C5); +ASSERT_REG_POSITION(draw_tfb_stride, 0x4C6); +ASSERT_REG_POSITION(blend_color, 0x4C7); ASSERT_REG_POSITION(blend, 0x4CF); ASSERT_REG_POSITION(stencil_enable, 0x4E0); ASSERT_REG_POSITION(stencil_front_op_fail, 0x4E1); @@ -1079,11 +1221,15 @@ ASSERT_REG_POSITION(stencil_front_func_func, 0x4E4); ASSERT_REG_POSITION(stencil_front_func_ref, 0x4E5); ASSERT_REG_POSITION(stencil_front_func_mask, 0x4E6); ASSERT_REG_POSITION(stencil_front_mask, 0x4E7); +ASSERT_REG_POSITION(frag_color_clamp, 0x4EA); ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(vb_element_base, 0x50D); +ASSERT_REG_POSITION(clip_distance_enabled, 0x544); ASSERT_REG_POSITION(point_size, 0x546); ASSERT_REG_POSITION(zeta_enable, 0x54E); +ASSERT_REG_POSITION(multisample_control, 0x54F); ASSERT_REG_POSITION(tsc, 0x557); +ASSERT_REG_POSITION(polygon_offset_factor, 0x55b); ASSERT_REG_POSITION(tic, 0x55D); ASSERT_REG_POSITION(stencil_two_side_enable, 0x565); ASSERT_REG_POSITION(stencil_back_op_fail, 0x566); @@ -1091,15 +1237,21 @@ ASSERT_REG_POSITION(stencil_back_op_zfail, 0x567); ASSERT_REG_POSITION(stencil_back_op_zpass, 0x568); ASSERT_REG_POSITION(stencil_back_func_func, 0x569); ASSERT_REG_POSITION(framebuffer_srgb, 0x56E); +ASSERT_REG_POSITION(polygon_offset_units, 0x56F); ASSERT_REG_POSITION(point_coord_replace, 0x581); ASSERT_REG_POSITION(code_address, 0x582); ASSERT_REG_POSITION(draw, 0x585); ASSERT_REG_POSITION(primitive_restart, 0x591); ASSERT_REG_POSITION(index_array, 0x5F2); +ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); ASSERT_REG_POSITION(cull, 0x646); +ASSERT_REG_POSITION(pixel_center_integer, 0x649); +ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); +ASSERT_REG_POSITION(view_volume_clip_control, 0x64F); ASSERT_REG_POSITION(logic_op, 0x671); ASSERT_REG_POSITION(clear_buffers, 0x674); +ASSERT_REG_POSITION(color_mask, 0x680); ASSERT_REG_POSITION(query, 0x6C0); ASSERT_REG_POSITION(vertex_array[0], 0x700); ASSERT_REG_POSITION(independent_blend, 0x780); diff --git 
a/src/video_core/engines/maxwell_compute.cpp b/src/video_core/engines/maxwell_compute.cpp index 8b5f08351..656db6a61 100644 --- a/src/video_core/engines/maxwell_compute.cpp +++ b/src/video_core/engines/maxwell_compute.cpp @@ -8,13 +8,13 @@ namespace Tegra::Engines { -void MaxwellCompute::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void MaxwellCompute::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid MaxwellCompute register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; - switch (method) { + switch (method_call.method) { case MAXWELL_COMPUTE_REG_INDEX(compute): { LOG_CRITICAL(HW_GPU, "Compute shaders are not implemented"); UNREACHABLE(); diff --git a/src/video_core/engines/maxwell_compute.h b/src/video_core/engines/maxwell_compute.h index 6ea934fb9..1d71f11bd 100644 --- a/src/video_core/engines/maxwell_compute.h +++ b/src/video_core/engines/maxwell_compute.h @@ -9,6 +9,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/gpu.h" namespace Tegra::Engines { @@ -42,7 +43,7 @@ public: "MaxwellCompute Regs has wrong size"); /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index b8a78cf82..06462f570 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -2,7 +2,9 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "core/core.h" #include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/rasterizer_interface.h" #include "video_core/textures/decoders.h" @@ -12,16 +14,16 @@ namespace Tegra::Engines { MaxwellDMA::MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : memory_manager(memory_manager), rasterizer{rasterizer} {} -void MaxwellDMA::WriteReg(u32 method, u32 value) { - ASSERT_MSG(method < Regs::NUM_REGS, +void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { + ASSERT_MSG(method_call.method < Regs::NUM_REGS, "Invalid MaxwellDMA register, increase the size of the Regs structure"); - regs.reg_array[method] = value; + regs.reg_array[method_call.method] = method_call.argument; #define MAXWELLDMA_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32)) - switch (method) { + switch (method_call.method) { case MAXWELLDMA_REG_INDEX(exec): { HandleCopy(); break; @@ -54,6 +56,9 @@ void MaxwellDMA::HandleCopy() { return; } + // All copies here update the main memory, so mark all rasterizer states as invalid. 
+ Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 5f3704f05..1f8cd65d2 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -24,7 +24,7 @@ public: ~MaxwellDMA() = default; /// Write the value to the register identified by method. - void WriteReg(u32 method, u32 value); + void CallMethod(const GPU::MethodCall& method_call); struct Regs { static constexpr std::size_t NUM_REGS = 0x1D6; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 141b9159b..e53c77f2b 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -5,12 +5,11 @@ #pragma once #include <bitset> +#include <optional> #include <string> #include <tuple> #include <vector> -#include <boost/optional.hpp> - #include "common/assert.h" #include "common/bit_field.h" #include "common/common_types.h" @@ -83,6 +82,8 @@ union Attribute { Position = 7, Attribute_0 = 8, Attribute_31 = 39, + ClipDistances0123 = 44, + ClipDistances4567 = 45, PointCoord = 46, // This attribute contains a tuple of (~, ~, InstanceId, VertexId) when inside a vertex // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval @@ -154,6 +155,7 @@ enum class PredCondition : u64 { NotEqual = 5, GreaterEqual = 6, LessThanWithNan = 9, + LessEqualWithNan = 11, GreaterThanWithNan = 12, NotEqualWithNan = 13, GreaterEqualWithNan = 14, @@ -262,7 +264,7 @@ enum class FlowCondition : u64 { Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for? 
}; -enum class ControlCode : u64 { +enum class ConditionCode : u64 { F = 0, LT = 1, EQ = 2, @@ -366,6 +368,11 @@ enum class HalfPrecision : u64 { FMZ = 2, }; +enum class R2pMode : u64 { + Pr = 0, + Cc = 1, +}; + enum class IpaInterpMode : u64 { Linear = 0, Perspective = 1, @@ -568,9 +575,8 @@ union Instruction { union { BitField<39, 2, u64> tab5cb8_2; - BitField<41, 3, u64> tab5c68_1; + BitField<41, 3, u64> postfactor; BitField<44, 2, u64> tab5c68_0; - BitField<47, 1, u64> cc; BitField<48, 1, u64> negate_b; } fmul; @@ -579,6 +585,10 @@ union Instruction { } fmul32; union { + BitField<52, 1, u64> generates_cc; + } op_32; + + union { BitField<48, 1, u64> is_signed; } shift; @@ -599,7 +609,7 @@ union Instruction { BitField<31, 1, u64> negate_b; BitField<30, 1, u64> abs_b; - BitField<47, 2, HalfType> type_b; + BitField<28, 2, HalfType> type_b; BitField<35, 2, HalfType> type_c; } alu_half; @@ -828,7 +838,7 @@ union Instruction { union { BitField<0, 3, u64> pred0; BitField<3, 3, u64> pred3; - BitField<8, 5, ControlCode> cc; // flag in cc + BitField<8, 5, ConditionCode> cc; // flag in cc BitField<39, 3, u64> pred39; BitField<42, 1, u64> neg_pred39; BitField<45, 4, PredOperation> op; // op with pred39 @@ -852,6 +862,12 @@ union Instruction { } hsetp2; union { + BitField<40, 1, R2pMode> mode; + BitField<41, 2, u64> byte; + BitField<20, 7, u64> immediate_mask; + } r2p; + + union { BitField<39, 3, u64> pred39; BitField<42, 1, u64> neg_pred; BitField<43, 1, u64> neg_a; @@ -1033,6 +1049,7 @@ union Instruction { BitField<49, 1, u64> nodep_flag; BitField<50, 3, u64> component_mask_selector; BitField<53, 4, u64> texture_info; + BitField<59, 1, u64> fp32_flag; TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. @@ -1048,6 +1065,7 @@ union Instruction { LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", static_cast<u32>(texture_info.Value())); UNREACHABLE(); + return TextureType::Texture1D; } TextureProcessMode GetTextureProcessMode() const { @@ -1128,6 +1146,7 @@ union Instruction { LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", static_cast<u32>(texture_info.Value())); UNREACHABLE(); + return TextureType::Texture1D; } TextureProcessMode GetTextureProcessMode() const { @@ -1232,6 +1251,7 @@ union Instruction { BitField<60, 1, u64> is_b_gpr; BitField<59, 1, u64> is_c_gpr; BitField<20, 24, s64> smem_imm; + BitField<0, 5, ConditionCode> flow_condition_code; Attribute attribute; Sampler sampler; @@ -1252,6 +1272,7 @@ public: BFE_C, BFE_R, BFE_IMM, + BFI_IMM_R, BRA, PBK, LD_A, @@ -1377,6 +1398,7 @@ public: PSETP, PSET, CSETP, + R2P_IMM, XMAD_IMM, XMAD_CR, XMAD_RC, @@ -1392,6 +1414,7 @@ public: ArithmeticHalf, ArithmeticHalfImmediate, Bfe, + Bfi, Shift, Ffma, Hfma2, @@ -1406,6 +1429,7 @@ public: HalfSetPredicate, PredicateSetPredicate, PredicateSetRegister, + RegisterSetPredicate, Conversion, Xmad, Unknown, @@ -1456,7 +1480,7 @@ public: Type type; }; - static boost::optional<const Matcher&> Decode(Instruction instr) { + static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -1464,7 +1488,8 @@ public: }; auto iter = std::find_if(table.begin(), table.end(), matches_instruction); - return iter != table.end() ? boost::optional<const Matcher&>(*iter) : boost::none; + return iter != table.end() ? 
std::optional<std::reference_wrapper<const Matcher>>(*iter) + : std::nullopt; } private: @@ -1527,7 +1552,7 @@ private: INST("1110111011011---", Id::STG, Type::Memory, "STG"), INST("110000----111---", Id::TEX, Type::Memory, "TEX"), INST("1101111101001---", Id::TXQ, Type::Memory, "TXQ"), - INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"), + INST("1101-00---------", Id::TEXS, Type::Memory, "TEXS"), INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"), INST("110010----111---", Id::TLD4, Type::Memory, "TLD4"), INST("1101111100------", Id::TLD4S, Type::Memory, "TLD4S"), @@ -1608,6 +1633,7 @@ private: INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"), INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"), INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"), + INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"), INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"), INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"), INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"), @@ -1642,6 +1668,7 @@ private: INST("0101000010001---", Id::PSET, Type::PredicateSetRegister, "PSET"), INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"), INST("010100001010----", Id::CSETP, Type::PredicateSetPredicate, "CSETP"), + INST("0011100-11110---", Id::R2P_IMM, Type::RegisterSetPredicate, "R2P_IMM"), INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"), INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"), INST("010100010-------", Id::XMAD_RC, Type::Xmad, "XMAD_RC"), @@ -1658,4 +1685,4 @@ private: } }; -} // namespace Tegra::Shader
\ No newline at end of file +} // namespace Tegra::Shader diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index a0e015c4b..99c34649f 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -62,7 +62,16 @@ struct Header { INSERT_PADDING_BYTES(1); // ImapSystemValuesB INSERT_PADDING_BYTES(16); // ImapGenericVector[32] INSERT_PADDING_BYTES(2); // ImapColor - INSERT_PADDING_BYTES(2); // ImapSystemValuesC + union { + BitField<0, 8, u16> clip_distances; + BitField<8, 1, u16> point_sprite_s; + BitField<9, 1, u16> point_sprite_t; + BitField<10, 1, u16> fog_coordinate; + BitField<12, 1, u16> tessellation_eval_point_u; + BitField<13, 1, u16> tessellation_eval_point_v; + BitField<14, 1, u16> instance_id; + BitField<15, 1, u16> vertex_id; + }; INSERT_PADDING_BYTES(5); // ImapFixedFncTexture[10] INSERT_PADDING_BYTES(1); // ImapReserved INSERT_PADDING_BYTES(3); // OmapSystemValuesA diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 83c7e5b0b..08cf6268f 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -17,6 +17,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { switch (format) { case PixelFormat::ABGR8: return 4; + default: + return 4; } UNREACHABLE(); @@ -24,6 +26,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { GPU::GPU(VideoCore::RasterizerInterface& rasterizer) { memory_manager = std::make_unique<Tegra::MemoryManager>(); + dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); maxwell_compute = std::make_unique<Engines::MaxwellCompute>(); @@ -49,6 +52,14 @@ const MemoryManager& GPU::MemoryManager() const { return *memory_manager; } +DmaPusher& GPU::DmaPusher() { + return *dma_pusher; +} + +const DmaPusher& GPU::DmaPusher() const { + return *dma_pusher; +} + u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { ASSERT(format != RenderTargetFormat::NONE); @@ -91,6 +102,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { return 1; default: UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format)); + return 1; } } @@ -108,6 +120,55 @@ u32 DepthFormatBytesPerPixel(DepthFormat format) { return 2; default: UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format)); + return 1; + } +} + +enum class BufferMethods { + BindObject = 0, + CountBufferMethods = 0x40, +}; + +void GPU::CallMethod(const MethodCall& method_call) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, + method_call.subchannel); + + ASSERT(method_call.subchannel < bound_engines.size()); + + if (method_call.method == static_cast<u32>(BufferMethods::BindObject)) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, + method_call.argument); + bound_engines[method_call.subchannel] = static_cast<EngineID>(method_call.argument); + return; + } + + if (method_call.method < static_cast<u32>(BufferMethods::CountBufferMethods)) { + // TODO(Subv): Research and implement these methods. 
+ LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); + return; + } + + const EngineID engine = bound_engines[method_call.subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMethod(method_call); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call); + break; + case EngineID::MAXWELL_COMPUTE_B: + maxwell_compute->CallMethod(method_call); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); } } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 5cc1e19ca..af5ccd1e9 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -9,6 +9,7 @@ #include <vector> #include "common/common_types.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/dma_pusher.h" #include "video_core/memory_manager.h" namespace VideoCore { @@ -119,8 +120,23 @@ public: explicit GPU(VideoCore::RasterizerInterface& rasterizer); ~GPU(); - /// Processes a command list stored at the specified address in GPU memory. - void ProcessCommandLists(const std::vector<CommandListHeader>& commands); + struct MethodCall { + u32 method{}; + u32 argument{}; + u32 subchannel{}; + u32 method_count{}; + + bool IsLastCall() const { + return method_count <= 1; + } + + MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) + : method(method), argument(argument), subchannel(subchannel), + method_count(method_count) {} + }; + + /// Calls a GPU method. + void CallMethod(const MethodCall& method_call); /// Returns a reference to the Maxwell3D GPU engine. Engines::Maxwell3D& Maxwell3D(); @@ -134,7 +150,14 @@ public: /// Returns a const reference to the GPU memory manager. const Tegra::MemoryManager& MemoryManager() const; + /// Returns a reference to the GPU DMA pusher. + Tegra::DmaPusher& DmaPusher(); + + /// Returns a const reference to the GPU DMA pusher. + const Tegra::DmaPusher& DmaPusher() const; + private: + std::unique_ptr<Tegra::DmaPusher> dma_pusher; std::unique_ptr<Tegra::MemoryManager> memory_manager; /// Mapping of command subchannels to their bound engine ids. diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index 377bd66ab..64f75db43 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -11,7 +11,7 @@ namespace Tegra { MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} -void MacroInterpreter::Execute(const std::vector<u32>& code, std::vector<u32> parameters) { +void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) { Reset(); registers[1] = parameters[0]; this->parameters = std::move(parameters); @@ -19,7 +19,7 @@ void MacroInterpreter::Execute(const std::vector<u32>& code, std::vector<u32> pa // Execute the code until we hit an exit condition. 
bool keep_executing = true; while (keep_executing) { - keep_executing = Step(code, false); + keep_executing = Step(offset, false); } // Assert the the macro used all the input parameters @@ -29,25 +29,26 @@ void MacroInterpreter::Execute(const std::vector<u32>& code, std::vector<u32> pa void MacroInterpreter::Reset() { registers = {}; pc = 0; - delayed_pc = boost::none; + delayed_pc = {}; method_address.raw = 0; parameters.clear(); // The next parameter index starts at 1, because $r1 already has the value of the first // parameter. next_parameter_index = 1; + carry_flag = false; } -bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) { +bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { u32 base_address = pc; - Opcode opcode = GetOpcode(code); + Opcode opcode = GetOpcode(offset); pc += 4; // Update the program counter if we were delayed - if (delayed_pc != boost::none) { + if (delayed_pc) { ASSERT(is_delay_slot); pc = *delayed_pc; - delayed_pc = boost::none; + delayed_pc = {}; } switch (opcode.operation) { @@ -108,7 +109,7 @@ bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) { delayed_pc = base_address + opcode.GetBranchTarget(); // Execute one more instruction due to the delay slot. - return Step(code, true); + return Step(offset, true); } break; } @@ -121,27 +122,42 @@ bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) { // Exit has a delay slot, execute the next instruction // Note: Executing an exit during a branch delay slot will cause the instruction at the // branch target to be executed before exiting. - Step(code, true); + Step(offset, true); return false; } return true; } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(const std::vector<u32>& code) const { +MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { + const auto& macro_memory{maxwell3d.GetMacroMemory()}; ASSERT((pc % sizeof(u32)) == 0); - ASSERT(pc < code.size() * sizeof(u32)); - return {code[pc / sizeof(u32)]}; + ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); + return {macro_memory[offset + pc / sizeof(u32)]}; } -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const { +u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { switch (operation) { - case ALUOperation::Add: - return src_a + src_b; - // TODO(Subv): Implement AddWithCarry - case ALUOperation::Subtract: - return src_a - src_b; - // TODO(Subv): Implement SubtractWithBorrow + case ALUOperation::Add: { + const u64 result{static_cast<u64>(src_a) + src_b}; + carry_flag = result > 0xffffffff; + return static_cast<u32>(result); + } + case ALUOperation::AddWithCarry: { + const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; + carry_flag = result > 0xffffffff; + return static_cast<u32>(result); + } + case ALUOperation::Subtract: { + const u64 result{static_cast<u64>(src_a) - src_b}; + carry_flag = result < 0x100000000; + return static_cast<u32>(result); + } + case ALUOperation::SubtractWithBorrow: { + const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 
0ULL : 1ULL)}; + carry_flag = result < 0x100000000; + return static_cast<u32>(result); + } case ALUOperation::Xor: return src_a ^ src_b; case ALUOperation::Or: @@ -155,6 +171,7 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) default: UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation)); + return 0; } } @@ -234,7 +251,7 @@ void MacroInterpreter::SetMethodAddress(u32 address) { } void MacroInterpreter::Send(u32 value) { - maxwell3d.WriteReg(method_address.address, value, 0); + maxwell3d.CallMethod({method_address.address, value}); // Increment the method address by the method increment. method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); @@ -252,6 +269,7 @@ bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) return value != 0; } UNREACHABLE(); + return true; } } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h index cee0baaf3..cde360288 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro_interpreter.h @@ -5,8 +5,9 @@ #pragma once #include <array> +#include <optional> #include <vector> -#include <boost/optional.hpp> + #include "common/bit_field.h" #include "common/common_types.h" @@ -21,10 +22,10 @@ public: /** * Executes the macro code with the specified input parameters. - * @param code The macro byte code to execute - * @param parameters The parameters of the macro + * @param offset Offset to start execution at. + * @param parameters The parameters of the macro. */ - void Execute(const std::vector<u32>& code, std::vector<u32> parameters); + void Execute(u32 offset, std::vector<u32> parameters); private: enum class Operation : u32 { @@ -109,14 +110,14 @@ private: /** * Executes a single macro instruction located at the current program counter. Returns whether * the interpreter should keep running. - * @param code The macro code to execute. + * @param offset Offset to start execution at. * @param is_delay_slot Whether the current step is being executed due to a delay slot in a * previous instruction. */ - bool Step(const std::vector<u32>& code, bool is_delay_slot); + bool Step(u32 offset, bool is_delay_slot); /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const; + u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); /// Performs the result operation on the input result and stores it in the specified register /// (if necessary). @@ -126,7 +127,7 @@ private: bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; /// Reads an opcode at the current program counter location. - Opcode GetOpcode(const std::vector<u32>& code) const; + Opcode GetOpcode(u32 offset) const; /// Returns the specified register's value. Register 0 is hardcoded to always return 0. u32 GetRegister(u32 register_id) const; @@ -149,7 +150,7 @@ private: Engines::Maxwell3D& maxwell3d; u32 pc; ///< Current program counter - boost::optional<u32> + std::optional<u32> delayed_pc; ///< Program counter to execute at after the delay slot is executed. static constexpr std::size_t NumMacroRegisters = 8; @@ -164,5 +165,7 @@ private: std::vector<u32> parameters; /// Index of the next parameter that will be fetched by the 'parm' instruction. 
u32 next_parameter_index = 0; + + bool carry_flag{}; }; } // namespace Tegra diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 022d4ab74..47247f097 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -4,18 +4,28 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/logging/log.h" #include "video_core/memory_manager.h" namespace Tegra { +MemoryManager::MemoryManager() { + // Mark the first page as reserved, so that 0 is not a valid GPUVAddr. Otherwise, games might + // try to use 0 as a valid address, which is also used to mean nullptr. This fixes a bug with + // Undertale using 0 for a render target. + PageSlot(0) = static_cast<u64>(PageStatus::Reserved); +} + GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { - boost::optional<GPUVAddr> gpu_addr = FindFreeBlock(size, align); - ASSERT(gpu_addr); + const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, align, PageStatus::Unmapped)}; + + ASSERT_MSG(gpu_addr, "unable to find available GPU memory"); - for (u64 offset = 0; offset < size; offset += PAGE_SIZE) { - VAddr& slot = PageSlot(*gpu_addr + offset); + for (u64 offset{}; offset < size; offset += PAGE_SIZE) { + VAddr& slot{PageSlot(*gpu_addr + offset)}; ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); + slot = static_cast<u64>(PageStatus::Allocated); } @@ -23,10 +33,11 @@ GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { } GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) { - for (u64 offset = 0; offset < size; offset += PAGE_SIZE) { - VAddr& slot = PageSlot(gpu_addr + offset); + for (u64 offset{}; offset < size; offset += PAGE_SIZE) { + VAddr& slot{PageSlot(gpu_addr + offset)}; ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); + slot = static_cast<u64>(PageStatus::Allocated); } @@ -34,17 +45,19 @@ GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) { } GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { - boost::optional<GPUVAddr> gpu_addr = FindFreeBlock(size, PAGE_SIZE); - ASSERT(gpu_addr); + const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, PAGE_SIZE, PageStatus::Unmapped)}; - for (u64 offset = 0; offset < size; offset += PAGE_SIZE) { - VAddr& slot = PageSlot(*gpu_addr + offset); + ASSERT_MSG(gpu_addr, "unable to find available GPU memory"); + + for (u64 offset{}; offset < size; offset += PAGE_SIZE) { + VAddr& slot{PageSlot(*gpu_addr + offset)}; ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); + slot = cpu_addr + offset; } - MappedRegion region{cpu_addr, *gpu_addr, size}; + const MappedRegion region{cpu_addr, *gpu_addr, size}; mapped_regions.push_back(region); return *gpu_addr; @@ -53,14 +66,31 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) { ASSERT((gpu_addr & PAGE_MASK) == 0); - for (u64 offset = 0; offset < size; offset += PAGE_SIZE) { - VAddr& slot = PageSlot(gpu_addr + offset); + if (PageSlot(gpu_addr) != static_cast<u64>(PageStatus::Allocated)) { + // Page has been already mapped. In this case, we must find a new area of memory to use that + // is different than the specified one. Super Mario Odyssey hits this scenario when changing + // areas, but we do not want to overwrite the old pages. + // TODO(bunnei): We need to write a hardware test to confirm this behavior. 
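To summarize the page-slot encoding this manager relies on (a restatement of the declarations in memory_manager.h further down, no new behaviour implied): each slot holds either a backing CPU address or a sentinel value, which is why the remap fallback that follows searches specifically for pages still marked Allocated.

    // slot == PageStatus::Unmapped  -> page never allocated
    // slot == PageStatus::Allocated -> reserved by AllocateSpace(), no CPU backing yet
    // slot == PageStatus::Reserved  -> page 0 only, so GPUVAddr 0 is never handed out
    // any other value               -> the CPU VAddr backing that GPU page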
+ + LOG_ERROR(HW_GPU, "attempting to map addr 0x{:016X}, which is not available!", gpu_addr); + + const std::optional<GPUVAddr> new_gpu_addr{ + FindFreeBlock(gpu_addr, size, PAGE_SIZE, PageStatus::Allocated)}; + + ASSERT_MSG(new_gpu_addr, "unable to find available GPU memory"); + + gpu_addr = *new_gpu_addr; + } + + for (u64 offset{}; offset < size; offset += PAGE_SIZE) { + VAddr& slot{PageSlot(gpu_addr + offset)}; ASSERT(slot == static_cast<u64>(PageStatus::Allocated)); + slot = cpu_addr + offset; } - MappedRegion region{cpu_addr, gpu_addr, size}; + const MappedRegion region{cpu_addr, gpu_addr, size}; mapped_regions.push_back(region); return gpu_addr; @@ -69,11 +99,12 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { ASSERT((gpu_addr & PAGE_MASK) == 0); - for (u64 offset = 0; offset < size; offset += PAGE_SIZE) { - VAddr& slot = PageSlot(gpu_addr + offset); + for (u64 offset{}; offset < size; offset += PAGE_SIZE) { + VAddr& slot{PageSlot(gpu_addr + offset)}; ASSERT(slot != static_cast<u64>(PageStatus::Allocated) && slot != static_cast<u64>(PageStatus::Unmapped)); + slot = static_cast<u64>(PageStatus::Unmapped); } @@ -97,13 +128,14 @@ GPUVAddr MemoryManager::GetRegionEnd(GPUVAddr region_start) const { return {}; } -boost::optional<GPUVAddr> MemoryManager::FindFreeBlock(u64 size, u64 align) { - GPUVAddr gpu_addr = 0; - u64 free_space = 0; +std::optional<GPUVAddr> MemoryManager::FindFreeBlock(GPUVAddr region_start, u64 size, u64 align, + PageStatus status) { + GPUVAddr gpu_addr{region_start}; + u64 free_space{}; align = (align + PAGE_MASK) & ~PAGE_MASK; while (gpu_addr + free_space < MAX_ADDRESS) { - if (!IsPageMapped(gpu_addr + free_space)) { + if (PageSlot(gpu_addr + free_space) == static_cast<u64>(status)) { free_space += PAGE_SIZE; if (free_space >= size) { return gpu_addr; @@ -118,8 +150,8 @@ boost::optional<GPUVAddr> MemoryManager::FindFreeBlock(u64 size, u64 align) { return {}; } -boost::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) { - VAddr base_addr = PageSlot(gpu_addr); +std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) { + const VAddr base_addr{PageSlot(gpu_addr)}; if (base_addr == static_cast<u64>(PageStatus::Allocated) || base_addr == static_cast<u64>(PageStatus::Unmapped)) { @@ -133,19 +165,15 @@ std::vector<GPUVAddr> MemoryManager::CpuToGpuAddress(VAddr cpu_addr) const { std::vector<GPUVAddr> results; for (const auto& region : mapped_regions) { if (cpu_addr >= region.cpu_addr && cpu_addr < (region.cpu_addr + region.size)) { - u64 offset = cpu_addr - region.cpu_addr; + const u64 offset{cpu_addr - region.cpu_addr}; results.push_back(region.gpu_addr + offset); } } return results; } -bool MemoryManager::IsPageMapped(GPUVAddr gpu_addr) { - return PageSlot(gpu_addr) != static_cast<u64>(PageStatus::Unmapped); -} - VAddr& MemoryManager::PageSlot(GPUVAddr gpu_addr) { - auto& block = page_table[(gpu_addr >> (PAGE_BITS + PAGE_TABLE_BITS)) & PAGE_TABLE_MASK]; + auto& block{page_table[(gpu_addr >> (PAGE_BITS + PAGE_TABLE_BITS)) & PAGE_TABLE_MASK]}; if (!block) { block = std::make_unique<PageBlock>(); block->fill(static_cast<VAddr>(PageStatus::Unmapped)); diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index caf80093f..fb03497ca 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -6,10 +6,9 @@ #include <array> #include <memory> +#include <optional> #include <vector> -#include 
<boost/optional.hpp> - #include "common/common_types.h" namespace Tegra { @@ -19,7 +18,7 @@ using GPUVAddr = u64; class MemoryManager final { public: - MemoryManager() = default; + MemoryManager(); GPUVAddr AllocateSpace(u64 size, u64 align); GPUVAddr AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align); @@ -27,7 +26,7 @@ public: GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size); GPUVAddr UnmapBuffer(GPUVAddr gpu_addr, u64 size); GPUVAddr GetRegionEnd(GPUVAddr region_start) const; - boost::optional<VAddr> GpuToCpuAddress(GPUVAddr gpu_addr); + std::optional<VAddr> GpuToCpuAddress(GPUVAddr gpu_addr); std::vector<GPUVAddr> CpuToGpuAddress(VAddr cpu_addr) const; static constexpr u64 PAGE_BITS = 16; @@ -35,15 +34,16 @@ public: static constexpr u64 PAGE_MASK = PAGE_SIZE - 1; private: - boost::optional<GPUVAddr> FindFreeBlock(u64 size, u64 align = 1); - bool IsPageMapped(GPUVAddr gpu_addr); - VAddr& PageSlot(GPUVAddr gpu_addr); - enum class PageStatus : u64 { Unmapped = 0xFFFFFFFFFFFFFFFFULL, Allocated = 0xFFFFFFFFFFFFFFFEULL, + Reserved = 0xFFFFFFFFFFFFFFFDULL, }; + std::optional<GPUVAddr> FindFreeBlock(GPUVAddr region_start, u64 size, u64 align, + PageStatus status); + VAddr& PageSlot(GPUVAddr gpu_addr); + static constexpr u64 MAX_ADDRESS{0x10000000000ULL}; static constexpr u64 PAGE_TABLE_BITS{10}; static constexpr u64 PAGE_TABLE_SIZE{1 << PAGE_TABLE_BITS}; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp new file mode 100644 index 000000000..b68f4fb13 --- /dev/null +++ b/src/video_core/morton.cpp @@ -0,0 +1,352 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <cstring> +#include "common/assert.h" +#include "common/common_types.h" +#include "core/memory.h" +#include "video_core/morton.h" +#include "video_core/surface.h" +#include "video_core/textures/decoders.h" + +namespace VideoCore { + +using Surface::GetBytesPerPixel; +using Surface::PixelFormat; + +using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, std::size_t, VAddr); +using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>; + +template <bool morton_to_linear, PixelFormat format> +static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth, + u32 tile_width_spacing, u8* buffer, std::size_t buffer_size, VAddr addr) { + constexpr u32 bytes_per_pixel = GetBytesPerPixel(format); + + // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual + // pixel values. 
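A quick worked example of the tile rounding used just below (numbers chosen for illustration): a 100-texel-wide DXT1 surface uses 4x4 tiles, so the swizzled copy walks (100 + 4 - 1) / 4 = 25 tile columns, and a partially covered right-most column is still handled as a whole tile.

    // Ceiling division by the tile width, as passed to CopySwizzledData below:
    const u32 tiles_x = (100 + 4 - 1) / 4; // == 25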
+ const u32 tile_size_x{GetDefaultBlockWidth(format)}; + const u32 tile_size_y{GetDefaultBlockHeight(format)}; + + if constexpr (morton_to_linear) { + Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel, + stride, height, depth, block_height, block_depth, + tile_width_spacing); + } else { + Tegra::Texture::CopySwizzledData( + (stride + tile_size_x - 1) / tile_size_x, (height + tile_size_y - 1) / tile_size_y, + depth, bytes_per_pixel, bytes_per_pixel, Memory::GetPointer(addr), buffer, false, + block_height, block_depth, tile_width_spacing); + } +} + +static constexpr ConversionArray morton_to_linear_fns = { + // clang-format off + MortonCopy<true, PixelFormat::ABGR8U>, + MortonCopy<true, PixelFormat::ABGR8S>, + MortonCopy<true, PixelFormat::ABGR8UI>, + MortonCopy<true, PixelFormat::B5G6R5U>, + MortonCopy<true, PixelFormat::A2B10G10R10U>, + MortonCopy<true, PixelFormat::A1B5G5R5U>, + MortonCopy<true, PixelFormat::R8U>, + MortonCopy<true, PixelFormat::R8UI>, + MortonCopy<true, PixelFormat::RGBA16F>, + MortonCopy<true, PixelFormat::RGBA16U>, + MortonCopy<true, PixelFormat::RGBA16UI>, + MortonCopy<true, PixelFormat::R11FG11FB10F>, + MortonCopy<true, PixelFormat::RGBA32UI>, + MortonCopy<true, PixelFormat::DXT1>, + MortonCopy<true, PixelFormat::DXT23>, + MortonCopy<true, PixelFormat::DXT45>, + MortonCopy<true, PixelFormat::DXN1>, + MortonCopy<true, PixelFormat::DXN2UNORM>, + MortonCopy<true, PixelFormat::DXN2SNORM>, + MortonCopy<true, PixelFormat::BC7U>, + MortonCopy<true, PixelFormat::BC6H_UF16>, + MortonCopy<true, PixelFormat::BC6H_SF16>, + MortonCopy<true, PixelFormat::ASTC_2D_4X4>, + MortonCopy<true, PixelFormat::BGRA8>, + MortonCopy<true, PixelFormat::RGBA32F>, + MortonCopy<true, PixelFormat::RG32F>, + MortonCopy<true, PixelFormat::R32F>, + MortonCopy<true, PixelFormat::R16F>, + MortonCopy<true, PixelFormat::R16U>, + MortonCopy<true, PixelFormat::R16S>, + MortonCopy<true, PixelFormat::R16UI>, + MortonCopy<true, PixelFormat::R16I>, + MortonCopy<true, PixelFormat::RG16>, + MortonCopy<true, PixelFormat::RG16F>, + MortonCopy<true, PixelFormat::RG16UI>, + MortonCopy<true, PixelFormat::RG16I>, + MortonCopy<true, PixelFormat::RG16S>, + MortonCopy<true, PixelFormat::RGB32F>, + MortonCopy<true, PixelFormat::RGBA8_SRGB>, + MortonCopy<true, PixelFormat::RG8U>, + MortonCopy<true, PixelFormat::RG8S>, + MortonCopy<true, PixelFormat::RG32UI>, + MortonCopy<true, PixelFormat::R32UI>, + MortonCopy<true, PixelFormat::ASTC_2D_8X8>, + MortonCopy<true, PixelFormat::ASTC_2D_8X5>, + MortonCopy<true, PixelFormat::ASTC_2D_5X4>, + MortonCopy<true, PixelFormat::BGRA8_SRGB>, + MortonCopy<true, PixelFormat::DXT1_SRGB>, + MortonCopy<true, PixelFormat::DXT23_SRGB>, + MortonCopy<true, PixelFormat::DXT45_SRGB>, + MortonCopy<true, PixelFormat::BC7U_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_5X5>, + MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>, + MortonCopy<true, PixelFormat::ASTC_2D_10X8>, + MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>, + MortonCopy<true, PixelFormat::Z32F>, + MortonCopy<true, PixelFormat::Z16>, + MortonCopy<true, PixelFormat::Z24S8>, + MortonCopy<true, PixelFormat::S8Z24>, + MortonCopy<true, PixelFormat::Z32FS8>, + // clang-format on +}; + +static constexpr ConversionArray linear_to_morton_fns = { + // clang-format off + MortonCopy<false, PixelFormat::ABGR8U>, 
+ MortonCopy<false, PixelFormat::ABGR8S>, + MortonCopy<false, PixelFormat::ABGR8UI>, + MortonCopy<false, PixelFormat::B5G6R5U>, + MortonCopy<false, PixelFormat::A2B10G10R10U>, + MortonCopy<false, PixelFormat::A1B5G5R5U>, + MortonCopy<false, PixelFormat::R8U>, + MortonCopy<false, PixelFormat::R8UI>, + MortonCopy<false, PixelFormat::RGBA16F>, + MortonCopy<false, PixelFormat::RGBA16U>, + MortonCopy<false, PixelFormat::RGBA16UI>, + MortonCopy<false, PixelFormat::R11FG11FB10F>, + MortonCopy<false, PixelFormat::RGBA32UI>, + MortonCopy<false, PixelFormat::DXT1>, + MortonCopy<false, PixelFormat::DXT23>, + MortonCopy<false, PixelFormat::DXT45>, + MortonCopy<false, PixelFormat::DXN1>, + MortonCopy<false, PixelFormat::DXN2UNORM>, + MortonCopy<false, PixelFormat::DXN2SNORM>, + MortonCopy<false, PixelFormat::BC7U>, + MortonCopy<false, PixelFormat::BC6H_UF16>, + MortonCopy<false, PixelFormat::BC6H_SF16>, + // TODO(Subv): Swizzling ASTC formats are not supported + nullptr, + MortonCopy<false, PixelFormat::BGRA8>, + MortonCopy<false, PixelFormat::RGBA32F>, + MortonCopy<false, PixelFormat::RG32F>, + MortonCopy<false, PixelFormat::R32F>, + MortonCopy<false, PixelFormat::R16F>, + MortonCopy<false, PixelFormat::R16U>, + MortonCopy<false, PixelFormat::R16S>, + MortonCopy<false, PixelFormat::R16UI>, + MortonCopy<false, PixelFormat::R16I>, + MortonCopy<false, PixelFormat::RG16>, + MortonCopy<false, PixelFormat::RG16F>, + MortonCopy<false, PixelFormat::RG16UI>, + MortonCopy<false, PixelFormat::RG16I>, + MortonCopy<false, PixelFormat::RG16S>, + MortonCopy<false, PixelFormat::RGB32F>, + MortonCopy<false, PixelFormat::RGBA8_SRGB>, + MortonCopy<false, PixelFormat::RG8U>, + MortonCopy<false, PixelFormat::RG8S>, + MortonCopy<false, PixelFormat::RG32UI>, + MortonCopy<false, PixelFormat::R32UI>, + nullptr, + nullptr, + nullptr, + MortonCopy<false, PixelFormat::BGRA8_SRGB>, + MortonCopy<false, PixelFormat::DXT1_SRGB>, + MortonCopy<false, PixelFormat::DXT23_SRGB>, + MortonCopy<false, PixelFormat::DXT45_SRGB>, + MortonCopy<false, PixelFormat::BC7U_SRGB>, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + MortonCopy<false, PixelFormat::Z32F>, + MortonCopy<false, PixelFormat::Z16>, + MortonCopy<false, PixelFormat::Z24S8>, + MortonCopy<false, PixelFormat::S8Z24>, + MortonCopy<false, PixelFormat::Z32FS8>, + // clang-format on +}; + +static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) { + switch (mode) { + case MortonSwizzleMode::MortonToLinear: + return morton_to_linear_fns[static_cast<std::size_t>(format)]; + case MortonSwizzleMode::LinearToMorton: + return linear_to_morton_fns[static_cast<std::size_t>(format)]; + } + UNREACHABLE(); + return morton_to_linear_fns[static_cast<std::size_t>(format)]; +} + +/// 8x8 Z-Order coordinate from 2D coordinates +static u32 MortonInterleave(u32 x, u32 y) { + static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15}; + static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a}; + return xlut[x % 8] + ylut[y % 8]; +} + +/// Calculates the offset of the position of the pixel in Morton order +static u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) { + // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each + // of which is composed of four 2x2 subtiles each of which is composed of four texels. + // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g. 
+ // texels are laid out in a 2x2 subtile like this: + // 2 3 + // 0 1 + // + // The full 8x8 tile has the texels arranged like this: + // + // 42 43 46 47 58 59 62 63 + // 40 41 44 45 56 57 60 61 + // 34 35 38 39 50 51 54 55 + // 32 33 36 37 48 49 52 53 + // 10 11 14 15 26 27 30 31 + // 08 09 12 13 24 25 28 29 + // 02 03 06 07 18 19 22 23 + // 00 01 04 05 16 17 20 21 + // + // This pattern is what's called Z-order curve, or Morton order. + + const unsigned int block_height = 8; + const unsigned int coarse_x = x & ~7; + + u32 i = MortonInterleave(x, y); + + const unsigned int offset = coarse_x * block_height; + + return (i + offset) * bytes_per_pixel; +} + +static u32 MortonInterleave128(u32 x, u32 y) { + // 128x128 Z-Order coordinate from 2D coordinates + static constexpr u32 xlut[] = { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042, + 0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809, + 0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000, + 0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043, + 0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a, + 0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001, + 0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, + 0x2049, 0x204a, 0x204b, 0x2800, 0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b, + 0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002, + 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049, + 0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840, + 0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003, + 0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a, + 0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841, + 0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008, + 0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b, + 0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842, + 0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009, + 0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2800, + 0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843, + 0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, + 0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801, + 0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848, + 0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b, + 0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802, + 0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849, + 0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040, + 0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803, + 0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a, + 0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041, + 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 
0x2800, 0x2801, 0x2802, 0x2803, 0x2808, + 0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b, + 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042, + 0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809, + 0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b, + }; + static constexpr u32 ylut[] = { + 0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090, + 0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124, + 0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200, + 0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294, + 0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330, + 0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404, + 0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0, + 0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534, + 0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610, + 0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4, + 0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780, + 0x0784, 0x0790, 0x0794, 0x07a0, 0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014, + 0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0, + 0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184, + 0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220, + 0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4, + 0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390, + 0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424, + 0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500, + 0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594, + 0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630, + 0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704, + 0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0, + 0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034, + 0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110, + 0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4, + 0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280, + 0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314, + 0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0, + 0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484, + 0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520, + 0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4, + 0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690, + 0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724, + 0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0, 
0x07a4, 0x07b0, 0x07b4, + }; + return xlut[x % 128] + ylut[y % 128]; +} + +static u32 GetMortonOffset128(u32 x, u32 y, u32 bytes_per_pixel) { + // Calculates the offset of the position of the pixel in Morton order + // Framebuffer images are split into 128x128 tiles. + + constexpr u32 block_height = 128; + const u32 coarse_x = x & ~127; + + const u32 i = MortonInterleave128(x, y); + + const u32 offset = coarse_x * block_height; + + return (i + offset) * bytes_per_pixel; +} + +void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride, + u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing, + u8* buffer, std::size_t buffer_size, VAddr addr) { + + GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth, + tile_width_spacing, buffer, buffer_size, addr); +} + +void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixel, u32 linear_bytes_per_pixel, + u8* morton_data, u8* linear_data, bool morton_to_linear) { + u8* data_ptrs[2]; + for (u32 y = 0; y < height; ++y) { + for (u32 x = 0; x < width; ++x) { + const u32 coarse_y = y & ~127; + const u32 morton_offset = + GetMortonOffset128(x, y, bytes_per_pixel) + coarse_y * width * bytes_per_pixel; + const u32 linear_pixel_index = (x + y * width) * linear_bytes_per_pixel; + + data_ptrs[morton_to_linear ? 1 : 0] = morton_data + morton_offset; + data_ptrs[morton_to_linear ? 0 : 1] = &linear_data[linear_pixel_index]; + + std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); + } + } +} + +} // namespace VideoCore diff --git a/src/video_core/morton.h b/src/video_core/morton.h new file mode 100644 index 000000000..065f59ce3 --- /dev/null +++ b/src/video_core/morton.h @@ -0,0 +1,21 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" +#include "video_core/surface.h" + +namespace VideoCore { + +enum class MortonSwizzleMode { MortonToLinear, LinearToMorton }; + +void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride, + u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing, + u8* buffer, std::size_t buffer_size, VAddr addr); + +void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixel, u32 linear_bytes_per_pixel, + u8* morton_data, u8* linear_data, bool morton_to_linear); + +} // namespace VideoCore diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp new file mode 100644 index 000000000..093b2cdf4 --- /dev/null +++ b/src/video_core/rasterizer_cache.cpp @@ -0,0 +1,7 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
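The rasterizer_cache changes that follow split the old interval-only object_cache into an exact-address map (for TryGet) plus an interval map (for range flush/invalidate), and inject the rasterizer through the constructor instead of reaching into Core::System. A derived cache then looks roughly like this (hypothetical names; it mirrors the OGLBufferCache change further down):

    // Sketch only: MyObject must provide GetAddr()/GetSizeInBytes() and the other hooks
    // required by RasterizerCacheObject.
    class MyCache final : public RasterizerCache<std::shared_ptr<MyObject>> {
    public:
        explicit MyCache(VideoCore::RasterizerInterface& rasterizer)
            : RasterizerCache{rasterizer} {}
    };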
+ +#include "video_core/rasterizer_cache.h" + +RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h index 0a3b3951e..bcf0c15a4 100644 --- a/src/video_core/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache.h @@ -5,18 +5,19 @@ #pragma once #include <set> +#include <unordered_map> #include <boost/icl/interval_map.hpp> #include <boost/range/iterator_range_core.hpp> #include "common/common_types.h" -#include "core/core.h" #include "core/settings.h" #include "video_core/rasterizer_interface.h" -#include "video_core/renderer_base.h" class RasterizerCacheObject { public: + virtual ~RasterizerCacheObject(); + /// Gets the address of the shader in guest memory, required for cache management virtual VAddr GetAddr() const = 0; @@ -64,6 +65,8 @@ class RasterizerCache : NonCopyable { friend class RasterizerCacheObject; public: + explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} + /// Write any cached resources overlapping the specified region back to memory void FlushRegion(Tegra::GPUVAddr addr, size_t size) { const auto& objects{GetSortedObjectsFromRegion(addr, size)}; @@ -86,45 +89,39 @@ public: /// Invalidates everything in the cache void InvalidateAll() { - while (object_cache.begin() != object_cache.end()) { - Unregister(*object_cache.begin()->second.begin()); + while (interval_cache.begin() != interval_cache.end()) { + Unregister(*interval_cache.begin()->second.begin()); } } protected: /// Tries to get an object from the cache with the specified address T TryGet(VAddr addr) const { - const ObjectInterval interval{addr}; - for (auto& pair : boost::make_iterator_range(object_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (cached_object->GetAddr() == addr) { - return cached_object; - } - } - } + const auto iter = map_cache.find(addr); + if (iter != map_cache.end()) + return iter->second; return nullptr; } /// Register an object into the cache void Register(const T& object) { object->SetIsRegistered(true); - object_cache.add({GetInterval(object), ObjectSet{object}}); - auto& rasterizer = Core::System::GetInstance().Renderer().Rasterizer(); + interval_cache.add({GetInterval(object), ObjectSet{object}}); + map_cache.insert({object->GetAddr(), object}); rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), 1); } /// Unregisters an object from the cache void Unregister(const T& object) { object->SetIsRegistered(false); - auto& rasterizer = Core::System::GetInstance().Renderer().Rasterizer(); rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1); - // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit if (Settings::values.use_accurate_gpu_emulation) { FlushObject(object); } - object_cache.subtract({GetInterval(object), ObjectSet{object}}); + interval_cache.subtract({GetInterval(object), ObjectSet{object}}); + map_cache.erase(object->GetAddr()); } /// Returns a ticks counter used for tracking when cached objects were last modified @@ -141,7 +138,7 @@ private: std::vector<T> objects; const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(object_cache.equal_range(interval))) { + for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { for (auto& cached_object : pair.second) { if (!cached_object) { continue; @@ -167,14 +164,17 @@ private: } using ObjectSet = std::set<T>; - using 
ObjectCache = boost::icl::interval_map<VAddr, ObjectSet>; - using ObjectInterval = typename ObjectCache::interval_type; + using ObjectCache = std::unordered_map<VAddr, T>; + using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; + using ObjectInterval = typename IntervalCache::interval_type; static auto GetInterval(const T& object) { return ObjectInterval::right_open(object->GetAddr(), object->GetAddr() + object->GetSizeInBytes()); } - ObjectCache object_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing + ObjectCache map_cache; + IntervalCache interval_cache; ///< Cache of objects + u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing + VideoCore::RasterizerInterface& rasterizer; }; diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp index 0df3725c2..94223f45f 100644 --- a/src/video_core/renderer_base.cpp +++ b/src/video_core/renderer_base.cpp @@ -5,7 +5,6 @@ #include "core/frontend/emu_window.h" #include "core/settings.h" #include "video_core/renderer_base.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" namespace VideoCore { @@ -28,4 +27,16 @@ void RendererBase::UpdateCurrentFramebufferLayout() { render_window.UpdateCurrentFramebufferLayout(layout.width, layout.height); } +void RendererBase::RequestScreenshot(void* data, std::function<void()> callback, + const Layout::FramebufferLayout& layout) { + if (renderer_settings.screenshot_requested) { + LOG_ERROR(Render, "A screenshot is already requested or in progress, ignoring the request"); + return; + } + renderer_settings.screenshot_bits = data; + renderer_settings.screenshot_complete_callback = std::move(callback); + renderer_settings.screenshot_framebuffer_layout = layout; + renderer_settings.screenshot_requested = true; +} + } // namespace VideoCore diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 2cd0738ff..1d54c3723 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -6,8 +6,10 @@ #include <atomic> #include <memory> -#include <boost/optional.hpp> +#include <optional> + #include "common/common_types.h" +#include "core/frontend/emu_window.h" #include "video_core/gpu.h" #include "video_core/rasterizer_interface.h" @@ -20,6 +22,12 @@ namespace VideoCore { struct RendererSettings { std::atomic_bool use_framelimiter{false}; std::atomic_bool set_background_color{false}; + + // Screenshot + std::atomic<bool> screenshot_requested{false}; + void* screenshot_bits; + std::function<void()> screenshot_complete_callback; + Layout::FramebufferLayout screenshot_framebuffer_layout; }; class RendererBase : NonCopyable { @@ -28,7 +36,8 @@ public: virtual ~RendererBase(); /// Swap buffers (render frame) - virtual void SwapBuffers(boost::optional<const Tegra::FramebufferConfig&> framebuffer) = 0; + virtual void SwapBuffers( + std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0; /// Initialize the renderer virtual bool Init() = 0; @@ -55,9 +64,29 @@ public: return *rasterizer; } + Core::Frontend::EmuWindow& GetRenderWindow() { + return render_window; + } + + const Core::Frontend::EmuWindow& GetRenderWindow() const { + return render_window; + } + + RendererSettings& Settings() { + return renderer_settings; + } + + const RendererSettings& Settings() const { + return renderer_settings; + } + /// Refreshes the settings common to all renderers void RefreshBaseSettings(); + /// Request a screenshot of the next frame + 
void RequestScreenshot(void* data, std::function<void()> callback, + const Layout::FramebufferLayout& layout); + protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. std::unique_ptr<RasterizerInterface> rasterizer; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index c142095c5..46a6c0308 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -9,15 +9,17 @@ #include "core/core.h" #include "core/memory.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" namespace OpenGL { -OGLBufferCache::OGLBufferCache(std::size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {} +OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) + : RasterizerCache{rasterizer}, stream_buffer(GL_ARRAY_BUFFER, size) {} GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, bool cache) { auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; + const std::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; // Cache management is a big overhead, so only cache entries with a given size. // TODO: Figure out which size is the best for given games. @@ -74,7 +76,7 @@ std::tuple<u8*, GLintptr> OGLBufferCache::ReserveMemory(std::size_t size, std::s return std::make_tuple(uploaded_ptr, uploaded_offset); } -void OGLBufferCache::Map(std::size_t max_size) { +bool OGLBufferCache::Map(std::size_t max_size) { bool invalidate; std::tie(buffer_ptr, buffer_offset_base, invalidate) = stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4); @@ -83,6 +85,7 @@ void OGLBufferCache::Map(std::size_t max_size) { if (invalidate) { InvalidateAll(); } + return invalidate; } void OGLBufferCache::Unmap() { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index be29dc8be..c11acfb79 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -15,6 +15,8 @@ namespace OpenGL { +class RasterizerOpenGL; + struct CachedBufferEntry final : public RasterizerCacheObject { VAddr GetAddr() const override { return addr; @@ -35,7 +37,7 @@ struct CachedBufferEntry final : public RasterizerCacheObject { class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { public: - explicit OGLBufferCache(std::size_t size); + explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size); /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been /// allocated. @@ -48,7 +50,7 @@ public: /// Reserves memory to be used by host's CPU. Returns mapped address and offset. 
std::tuple<u8*, GLintptr> ReserveMemory(std::size_t size, std::size_t alignment = 4); - void Map(std::size_t max_size); + bool Map(std::size_t max_size); void Unmap(); GLuint GetHandle() const; diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp index ee1d9601b..d9ed08437 100644 --- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp +++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp @@ -6,6 +6,7 @@ #include <array> #include "common/assert.h" #include "common/common_types.h" +#include "core/core.h" #include "core/memory.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_primitive_assembler.h" @@ -45,7 +46,7 @@ GLintptr PrimitiveAssembler::MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size); auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; + const std::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; const u8* source{Memory::GetPointer(*cpu_addr)}; for (u32 primitive = 0; primitive < count / 4; ++primitive) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index cb180b93c..089daf96f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,10 +30,11 @@ namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using PixelFormat = SurfaceParams::PixelFormat; -using SurfaceType = SurfaceParams::SurfaceType; +using PixelFormat = VideoCore::Surface::PixelFormat; +using SurfaceType = VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192)); @@ -78,64 +79,79 @@ struct DrawParameters { } }; +struct FramebufferCacheKey { + bool is_single_buffer = false; + bool stencil_enable = false; + + std::array<GLenum, Maxwell::NumRenderTargets> color_attachments{}; + std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors{}; + u32 colors_count = 0; + + GLuint zeta = 0; + + auto Tie() const { + return std::tie(is_single_buffer, stencil_enable, color_attachments, colors, colors_count, + zeta); + } + + bool operator<(const FramebufferCacheKey& rhs) const { + return Tie() < rhs.Tie(); + } +}; + RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info) - : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) { + : res_cache{*this}, shader_cache{*this}, emu_window{window}, screen_info{info}, + buffer_cache(*this, STREAM_BUFFER_SIZE) { // Create sampler objects for (std::size_t i = 0; i < texture_samplers.size(); ++i) { texture_samplers[i].Create(); state.texture_units[i].sampler = texture_samplers[i].sampler.handle; } - GLint ext_num; - glGetIntegerv(GL_NUM_EXTENSIONS, &ext_num); - for (GLint i = 0; i < ext_num; i++) { - const std::string_view extension{ - 
reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, i))}; - - if (extension == "GL_ARB_direct_state_access") { - has_ARB_direct_state_access = true; - } else if (extension == "GL_ARB_multi_bind") { - has_ARB_multi_bind = true; - } else if (extension == "GL_ARB_separate_shader_objects") { - has_ARB_separate_shader_objects = true; - } else if (extension == "GL_ARB_vertex_attrib_binding") { - has_ARB_vertex_attrib_binding = true; - } - } - - ASSERT_MSG(has_ARB_separate_shader_objects, "has_ARB_separate_shader_objects is unsupported"); - - // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0 - state.clip_distance[0] = true; - - // Create render framebuffer - framebuffer.Create(); + OpenGLState::ApplyDefaultState(); shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); - glEnable(GL_BLEND); - glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); LOG_CRITICAL(Render_OpenGL, "Sync fixed function OpenGL state here!"); + CheckExtensions(); } RasterizerOpenGL::~RasterizerOpenGL() {} -void RasterizerOpenGL::SetupVertexArrays() { - MICROPROFILE_SCOPE(OpenGL_VAO); - const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); +void RasterizerOpenGL::CheckExtensions() { + if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { + LOG_WARNING( + Render_OpenGL, + "Anisotropic filter is not supported! This can cause graphical issues in some games."); + } + if (!GLAD_GL_ARB_buffer_storage) { + LOG_WARNING( + Render_OpenGL, + "Buffer storage control is not supported! This can cause performance degradation."); + } +} + +void RasterizerOpenGL::SetupVertexFormat() { + auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); const auto& regs = gpu.regs; + if (!gpu.dirty_flags.vertex_attrib_format) + return; + gpu.dirty_flags.vertex_attrib_format = false; + + MICROPROFILE_SCOPE(OpenGL_VAO); + auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format); auto& VAO = iter->second; if (is_cache_miss) { VAO.Create(); state.draw.vertex_array = VAO.handle; - state.Apply(); + state.ApplyVertexBufferState(); // The index buffer binding is stored within the VAO. Stupid OpenGL, but easy to work // around. @@ -177,11 +193,26 @@ void RasterizerOpenGL::SetupVertexArrays() { } } state.draw.vertex_array = VAO.handle; - state.draw.vertex_buffer = buffer_cache.GetHandle(); - state.Apply(); + state.ApplyVertexBufferState(); + + // Rebinding the VAO invalidates the vertex buffer bindings. + gpu.dirty_flags.vertex_array = 0xFFFFFFFF; +} + +void RasterizerOpenGL::SetupVertexBuffer() { + auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + if (!gpu.dirty_flags.vertex_array) + return; + + MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (~gpu.dirty_flags.vertex_array & (1u << index)) + continue; + const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) continue; @@ -205,6 +236,11 @@ void RasterizerOpenGL::SetupVertexArrays() { glVertexBindingDivisor(index, 0); } } + + // Implicit set by glBindVertexBuffer. Stupid glstate handling... + state.draw.vertex_buffer = buffer_cache.GetHandle(); + + gpu.dirty_flags.vertex_array = 0; } DrawParameters RasterizerOpenGL::SetupDraw() { @@ -263,6 +299,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { // shaders. 
The constbuffer bindpoint starts after the shader stage configuration bind points. u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage; u32 current_texture_bindpoint = 0; + std::array<bool, Maxwell::NumClipDistances> clip_distances{}; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; @@ -323,6 +360,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader, primitive_mode, current_texture_bindpoint); + // Workaround for Intel drivers. + // When a clip distance is enabled but not set in the shader it crops parts of the screen + // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the + // clip distances only when it's written by a shader stage. + for (std::size_t i = 0; i < Maxwell::NumClipDistances; ++i) { + clip_distances[i] |= shader->GetShaderEntries().clip_distances[i]; + } + // When VertexA is enabled, we have dual vertex shaders if (program == Maxwell::ShaderProgram::VertexA) { // VertexB was combined with VertexA, so we skip the VertexB iteration @@ -330,7 +375,45 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } } - state.Apply(); + SyncClipEnabled(clip_distances); +} + +void RasterizerOpenGL::SetupCachedFramebuffer(const FramebufferCacheKey& fbkey, + OpenGLState& current_state) { + const auto [entry, is_cache_miss] = framebuffer_cache.try_emplace(fbkey); + auto& framebuffer = entry->second; + + if (is_cache_miss) + framebuffer.Create(); + + current_state.draw.draw_framebuffer = framebuffer.handle; + current_state.ApplyFramebufferState(); + + if (!is_cache_miss) + return; + + if (fbkey.is_single_buffer) { + if (fbkey.color_attachments[0] != GL_NONE) { + glFramebufferTexture(GL_DRAW_FRAMEBUFFER, fbkey.color_attachments[0], fbkey.colors[0], + 0); + } + glDrawBuffer(fbkey.color_attachments[0]); + } else { + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + if (fbkey.colors[index]) { + glFramebufferTexture(GL_DRAW_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), + fbkey.colors[index], 0); + } + } + glDrawBuffers(fbkey.colors_count, fbkey.color_attachments.data()); + } + + if (fbkey.zeta) { + GLenum zeta_attachment = + fbkey.stencil_enable ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT; + glFramebufferTexture(GL_DRAW_FRAMEBUFFER, zeta_attachment, fbkey.zeta, 0); + } } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -399,9 +482,9 @@ void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { cached_pages.add({pages_interval, delta}); } -void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb, - bool preserve_contents, - boost::optional<std::size_t> single_color_target) { +void RasterizerOpenGL::ConfigureFramebuffers(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool preserve_contents, + std::optional<std::size_t> single_color_target) { MICROPROFILE_SCOPE(OpenGL_Framebuffer); const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; @@ -413,12 +496,12 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep // TODO(bunnei): Figure out how the below register works. According to envytools, this should be // used to enable multiple render targets. However, it is left unset on all games that I have // tested. 
- ASSERT_MSG(regs.rt_separate_frag_data == 0, "Unimplemented"); + UNIMPLEMENTED_IF(regs.rt_separate_frag_data != 0); // Bind the framebuffer surfaces - state.draw.draw_framebuffer = framebuffer.handle; - state.Apply(); - state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; + current_state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; + + FramebufferCacheKey fbkey; if (using_color_fb) { if (single_color_target) { @@ -435,14 +518,12 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep state.framebuffer_srgb.enabled |= color_surface->GetSurfaceParams().srgb_conversion; } - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, - GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target), GL_TEXTURE_2D, - color_surface != nullptr ? color_surface->Texture().handle : 0, 0); - glDrawBuffer(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target)); + fbkey.is_single_buffer = true; + fbkey.color_attachments[0] = + GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target); + fbkey.colors[0] = color_surface != nullptr ? color_surface->Texture().handle : 0; } else { // Multiple color attachments are enabled - std::array<GLenum, Maxwell::NumRenderTargets> buffers; for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents); @@ -457,22 +538,17 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep color_surface->GetSurfaceParams().srgb_conversion; } - buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), - GL_TEXTURE_2D, color_surface != nullptr ? color_surface->Texture().handle : 0, - 0); + fbkey.color_attachments[index] = + GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); + fbkey.colors[index] = + color_surface != nullptr ? color_surface->Texture().handle : 0; } - glDrawBuffers(regs.rt_control.count, buffers.data()); + fbkey.is_single_buffer = false; + fbkey.colors_count = regs.rt_control.count; } } else { - // No color attachments are enabled - zero out all of them - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, - GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), GL_TEXTURE_2D, - 0, 0); - } - glDrawBuffer(GL_NONE); + // No color attachments are enabled - leave them as zero + fbkey.is_single_buffer = true; } if (depth_surface) { @@ -480,26 +556,13 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep // the shader doesn't actually write to it. 
depth_surface->MarkAsModified(true, res_cache); - if (regs.stencil_enable) { - // Attach both depth and stencil - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - depth_surface->Texture().handle, 0); - } else { - // Attach depth - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - depth_surface->Texture().handle, 0); - // Clear stencil attachment - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - } - } else { - // Clear both depth and stencil attachment - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); + fbkey.zeta = depth_surface->Texture().handle; + fbkey.stencil_enable = regs.stencil_enable; } - SyncViewport(); + SetupCachedFramebuffer(fbkey, current_state); - state.Apply(); + SyncViewport(current_state); } void RasterizerOpenGL::Clear() { @@ -512,22 +575,23 @@ void RasterizerOpenGL::Clear() { bool use_stencil{}; OpenGLState clear_state; - clear_state.draw.draw_framebuffer = framebuffer.handle; - clear_state.color_mask.red_enabled = regs.clear_buffers.R ? GL_TRUE : GL_FALSE; - clear_state.color_mask.green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE; - clear_state.color_mask.blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE; - clear_state.color_mask.alpha_enabled = regs.clear_buffers.A ? GL_TRUE : GL_FALSE; - if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; } + if (use_color) { + clear_state.color_mask[0].red_enabled = regs.clear_buffers.R ? GL_TRUE : GL_FALSE; + clear_state.color_mask[0].green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE; + clear_state.color_mask[0].blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE; + clear_state.color_mask[0].alpha_enabled = regs.clear_buffers.A ? GL_TRUE : GL_FALSE; + } if (regs.clear_buffers.Z) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!"); use_depth = true; // Always enable the depth write when clearing the depth buffer. The depth write mask is - // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to true. + // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to + // true. 
clear_state.depth.test_enabled = true; clear_state.depth.test_func = GL_ALWAYS; } @@ -535,6 +599,30 @@ void RasterizerOpenGL::Clear() { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); use_stencil = true; clear_state.stencil.test_enabled = true; + if (regs.clear_flags.stencil) { + // Stencil affects the clear so fill it with the used masks + clear_state.stencil.front.test_func = GL_ALWAYS; + clear_state.stencil.front.test_mask = regs.stencil_front_func_mask; + clear_state.stencil.front.action_stencil_fail = GL_KEEP; + clear_state.stencil.front.action_depth_fail = GL_KEEP; + clear_state.stencil.front.action_depth_pass = GL_KEEP; + clear_state.stencil.front.write_mask = regs.stencil_front_mask; + if (regs.stencil_two_side_enable) { + clear_state.stencil.back.test_func = GL_ALWAYS; + clear_state.stencil.back.test_mask = regs.stencil_back_func_mask; + clear_state.stencil.back.action_stencil_fail = GL_KEEP; + clear_state.stencil.back.action_depth_fail = GL_KEEP; + clear_state.stencil.back.action_depth_pass = GL_KEEP; + clear_state.stencil.back.write_mask = regs.stencil_back_mask; + } else { + clear_state.stencil.back.test_func = GL_ALWAYS; + clear_state.stencil.back.test_mask = 0xFFFFFFFF; + clear_state.stencil.back.write_mask = 0xFFFFFFFF; + clear_state.stencil.back.action_stencil_fail = GL_KEEP; + clear_state.stencil.back.action_depth_fail = GL_KEEP; + clear_state.stencil.back.action_depth_pass = GL_KEEP; + } + } } if (!use_color && !use_depth && !use_stencil) { @@ -544,11 +632,16 @@ void RasterizerOpenGL::Clear() { ScopeAcquireGLContext acquire_context{emu_window}; - ConfigureFramebuffers(use_color, use_depth || use_stencil, false, + ConfigureFramebuffers(clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); - // Copy the sRGB setting to the clear state to avoid problem with - // specific driver implementations - clear_state.framebuffer_srgb.enabled = state.framebuffer_srgb.enabled; + if (regs.clear_flags.scissor) { + SyncScissorTest(clear_state); + } + + if (regs.clear_flags.viewport) { + clear_state.EmulateViewportWithScissor(); + } + clear_state.Apply(); if (use_color) { @@ -569,26 +662,27 @@ void RasterizerOpenGL::DrawArrays() { return; MICROPROFILE_SCOPE(OpenGL_Drawing); - const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); + auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); const auto& regs = gpu.regs; ScopeAcquireGLContext acquire_context{emu_window}; - ConfigureFramebuffers(); - + ConfigureFramebuffers(state); + SyncColorMask(); + SyncFragmentColorClampState(); + SyncMultiSampleState(); SyncDepthTestState(); SyncStencilTestState(); SyncBlendState(); SyncLogicOpState(); SyncCullMode(); SyncPrimitiveRestart(); - SyncDepthRange(); - SyncScissorTest(); + SyncScissorTest(state); // Alpha Testing is synced on shaders. 
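When regs.clear_flags.viewport is set, the code above calls clear_state.EmulateViewportWithScissor() before applying the clear, because glClear honours the scissor box but not the viewport rectangle. A small stand-alone sketch of that idea, independent of the OpenGLState helper (whose exact behaviour is not shown in this hunk):

    // Illustrative sketch: restrict a clear to a rectangle by routing it through
    // the scissor test, since glClear ignores the viewport but honours the scissor.
    // Assumes a current GL context and a bound draw framebuffer.
    #include <glad/glad.h>

    void ClearRectOnly(GLint x, GLint y, GLsizei width, GLsizei height) {
        const GLboolean scissor_was_enabled = glIsEnabled(GL_SCISSOR_TEST);
        glEnable(GL_SCISSOR_TEST);
        glScissor(x, y, width, height);

        glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
        glClear(GL_COLOR_BUFFER_BIT);

        if (!scissor_was_enabled) {
            glDisable(GL_SCISSOR_TEST);
        }
    }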
SyncTransformFeedback(); SyncPointState(); CheckAlphaTests(); - + SyncPolygonOffset(); // TODO(bunnei): Sync framebuffer_scale uniform here // TODO(bunnei): Sync scissorbox uniform(s) here @@ -596,7 +690,7 @@ void RasterizerOpenGL::DrawArrays() { const bool is_indexed = accelerate_draw == AccelDraw::Indexed; state.draw.vertex_buffer = buffer_cache.GetHandle(); - state.Apply(); + state.ApplyVertexBufferState(); std::size_t buffer_size = CalculateVertexArraysSize(); @@ -621,9 +715,14 @@ void RasterizerOpenGL::DrawArrays() { // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); - buffer_cache.Map(buffer_size); + bool invalidate = buffer_cache.Map(buffer_size); + if (invalidate) { + // As all cached buffers are invalidated, we need to recheck their state. + gpu.dirty_flags.vertex_array = 0xFFFFFFFF; + } - SetupVertexArrays(); + SetupVertexFormat(); + SetupVertexBuffer(); DrawParameters params = SetupDraw(); SetupShaders(params.primitive_mode); @@ -636,7 +735,7 @@ void RasterizerOpenGL::DrawArrays() { params.DispatchDraw(); // Disable scissor test - state.scissor.enabled = false; + state.viewports[0].scissor.enabled = false; accelerate_draw = AccelDraw::Disabled; @@ -703,7 +802,8 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, // Verify that the cached surface is the same size and format as the requested framebuffer const auto& params{surface->GetSurfaceParams()}; - const auto& pixel_format{SurfaceParams::PixelFormatFromGPUPixelFormat(config.pixel_format)}; + const auto& pixel_format{ + VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); ASSERT_MSG(params.pixel_format == pixel_format, "Framebuffer pixel_format is different"); @@ -728,14 +828,17 @@ void RasterizerOpenGL::SamplerInfo::Create() { void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntry& config) { const GLuint s = sampler.handle; - if (mag_filter != config.mag_filter) { mag_filter = config.mag_filter; - glSamplerParameteri(s, GL_TEXTURE_MAG_FILTER, MaxwellToGL::TextureFilterMode(mag_filter)); + glSamplerParameteri( + s, GL_TEXTURE_MAG_FILTER, + MaxwellToGL::TextureFilterMode(mag_filter, Tegra::Texture::TextureMipmapFilter::None)); } - if (min_filter != config.min_filter) { + if (min_filter != config.min_filter || mip_filter != config.mip_filter) { min_filter = config.min_filter; - glSamplerParameteri(s, GL_TEXTURE_MIN_FILTER, MaxwellToGL::TextureFilterMode(min_filter)); + mip_filter = config.mip_filter; + glSamplerParameteri(s, GL_TEXTURE_MIN_FILTER, + MaxwellToGL::TextureFilterMode(min_filter, mip_filter)); } if (wrap_u != config.wrap_u) { @@ -766,15 +869,51 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr MaxwellToGL::DepthCompareFunc(depth_compare_func)); } - if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border || - wrap_p == Tegra::Texture::WrapMode::Border) { - const GLvec4 new_border_color = {{config.border_color_r, config.border_color_g, - config.border_color_b, config.border_color_a}}; - if (border_color != new_border_color) { - border_color = new_border_color; - glSamplerParameterfv(s, GL_TEXTURE_BORDER_COLOR, border_color.data()); + GLvec4 new_border_color; + if (config.srgb_conversion) { + new_border_color[0] = 
config.srgb_border_color_r / 255.0f; + new_border_color[1] = config.srgb_border_color_g / 255.0f; + new_border_color[2] = config.srgb_border_color_g / 255.0f; + } else { + new_border_color[0] = config.border_color_r; + new_border_color[1] = config.border_color_g; + new_border_color[2] = config.border_color_b; + } + new_border_color[3] = config.border_color_a; + + if (border_color != new_border_color) { + border_color = new_border_color; + glSamplerParameterfv(s, GL_TEXTURE_BORDER_COLOR, border_color.data()); + } + + const float anisotropic_max = static_cast<float>(1 << config.max_anisotropy.Value()); + if (anisotropic_max != max_anisotropic) { + max_anisotropic = anisotropic_max; + if (GLAD_GL_ARB_texture_filter_anisotropic) { + glSamplerParameterf(s, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropic); + } else if (GLAD_GL_EXT_texture_filter_anisotropic) { + glSamplerParameterf(s, GL_TEXTURE_MAX_ANISOTROPY_EXT, max_anisotropic); } } + const float lod_min = static_cast<float>(config.min_lod_clamp.Value()) / 256.0f; + if (lod_min != min_lod) { + min_lod = lod_min; + glSamplerParameterf(s, GL_TEXTURE_MIN_LOD, min_lod); + } + + const float lod_max = static_cast<float>(config.max_lod_clamp.Value()) / 256.0f; + if (lod_max != max_lod) { + max_lod = lod_max; + glSamplerParameterf(s, GL_TEXTURE_MAX_LOD, max_lod); + } + const u32 bias = config.mip_lod_bias.Value(); + // Sign extend the 13-bit value. + constexpr u32 mask = 1U << (13 - 1); + const float bias_lod = static_cast<s32>((bias ^ mask) - mask) / 256.f; + if (lod_bias != bias_lod) { + lod_bias = bias_lod; + glSamplerParameterf(s, GL_TEXTURE_LOD_BIAS, lod_bias); + } } u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader, @@ -875,8 +1014,11 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc); Surface surface = res_cache.GetTextureSurface(texture, entry); if (surface != nullptr) { - state.texture_units[current_bindpoint].texture = surface->Texture().handle; - state.texture_units[current_bindpoint].target = surface->Target(); + const GLuint handle = + entry.IsArray() ? surface->TextureLayer().handle : surface->Texture().handle; + const GLenum target = entry.IsArray() ? surface->TargetLayer() : surface->Target(); + state.texture_units[current_bindpoint].texture = handle; + state.texture_units[current_bindpoint].target = target; state.texture_units[current_bindpoint].swizzle.r = MaxwellToGL::SwizzleSource(texture.tic.x_source); state.texture_units[current_bindpoint].swizzle.g = @@ -894,22 +1036,44 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, return current_unit + static_cast<u32>(entries.size()); } -void RasterizerOpenGL::SyncViewport() { +void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()}; - - state.viewport.x = viewport_rect.left; - state.viewport.y = viewport_rect.bottom; - state.viewport.width = static_cast<GLsizei>(viewport_rect.GetWidth()); - state.viewport.height = static_cast<GLsizei>(viewport_rect.GetHeight()); + const bool geometry_shaders_enabled = + regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); + const std::size_t viewport_count = + geometry_shaders_enabled ? 
Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; + for (std::size_t i = 0; i < viewport_count; i++) { + auto& viewport = current_state.viewports[i]; + const auto& src = regs.viewports[i]; + const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[i].GetRect()}; + viewport.x = viewport_rect.left; + viewport.y = viewport_rect.bottom; + viewport.width = viewport_rect.GetWidth(); + viewport.height = viewport_rect.GetHeight(); + viewport.depth_range_far = regs.viewports[i].depth_range_far; + viewport.depth_range_near = regs.viewports[i].depth_range_near; + } + state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0; + state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0; } -void RasterizerOpenGL::SyncClipEnabled() { - UNREACHABLE(); +void RasterizerOpenGL::SyncClipEnabled( + const std::array<bool, Maxwell::Regs::NumClipDistances>& clip_mask) { + + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + const std::array<bool, Maxwell::Regs::NumClipDistances> reg_state{ + regs.clip_distance_enabled.c0 != 0, regs.clip_distance_enabled.c1 != 0, + regs.clip_distance_enabled.c2 != 0, regs.clip_distance_enabled.c3 != 0, + regs.clip_distance_enabled.c4 != 0, regs.clip_distance_enabled.c5 != 0, + regs.clip_distance_enabled.c6 != 0, regs.clip_distance_enabled.c7 != 0}; + + for (std::size_t i = 0; i < Maxwell::Regs::NumClipDistances; ++i) { + state.clip_distance[i] = reg_state[i] && clip_mask[i]; + } } void RasterizerOpenGL::SyncClipCoef() { - UNREACHABLE(); + UNIMPLEMENTED(); } void RasterizerOpenGL::SyncCullMode() { @@ -943,13 +1107,6 @@ void RasterizerOpenGL::SyncPrimitiveRestart() { state.primitive_restart.index = regs.primitive_restart.index; } -void RasterizerOpenGL::SyncDepthRange() { - const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - - state.depth.depth_range_near = regs.viewport->depth_range_near; - state.depth.depth_range_far = regs.viewport->depth_range_far; -} - void RasterizerOpenGL::SyncDepthTestState() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; @@ -970,9 +1127,6 @@ void RasterizerOpenGL::SyncStencilTestState() { return; } - // TODO(bunnei): Verify behavior when this is not set - ASSERT(regs.stencil_two_side_enable); - state.stencil.front.test_func = MaxwellToGL::ComparisonOp(regs.stencil_front_func_func); state.stencil.front.test_ref = regs.stencil_front_func_ref; state.stencil.front.test_mask = regs.stencil_front_func_mask; @@ -980,42 +1134,95 @@ void RasterizerOpenGL::SyncStencilTestState() { state.stencil.front.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_zfail); state.stencil.front.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_front_op_zpass); state.stencil.front.write_mask = regs.stencil_front_mask; + if (regs.stencil_two_side_enable) { + state.stencil.back.test_func = MaxwellToGL::ComparisonOp(regs.stencil_back_func_func); + state.stencil.back.test_ref = regs.stencil_back_func_ref; + state.stencil.back.test_mask = regs.stencil_back_func_mask; + state.stencil.back.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_fail); + state.stencil.back.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_zfail); + state.stencil.back.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_back_op_zpass); + state.stencil.back.write_mask = regs.stencil_back_mask; + } else { + state.stencil.back.test_func = GL_ALWAYS; + state.stencil.back.test_ref = 0; + state.stencil.back.test_mask = 0xFFFFFFFF; + 
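For reference, the SamplerInfo::SyncWithConfig hunk earlier in this diff decodes mip_lod_bias as a 13-bit two's-complement fixed-point value with 8 fractional bits, using the xor/subtract sign-extension trick ((bias ^ mask) - mask). A self-contained check of that conversion; the helper name below is made up for illustration:

    // Sign-extends a 13-bit two's-complement value and scales it by 1/256
    // (8 fractional bits), mirroring the mip_lod_bias conversion above.
    #include <cassert>
    #include <cstdint>

    float DecodeLodBias13(std::uint32_t raw13) {
        constexpr std::uint32_t sign_bit = 1U << (13 - 1); // 0x1000
        const std::int32_t extended = static_cast<std::int32_t>((raw13 ^ sign_bit) - sign_bit);
        return static_cast<float>(extended) / 256.0f;
    }

    int main() {
        assert(DecodeLodBias13(0x0100) == 1.0f);  // +256 / 256
        assert(DecodeLodBias13(0x1F00) == -1.0f); // 13-bit -256 / 256
        assert(DecodeLodBias13(0x0000) == 0.0f);
        return 0;
    }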
state.stencil.back.write_mask = 0xFFFFFFFF; + state.stencil.back.action_stencil_fail = GL_KEEP; + state.stencil.back.action_depth_fail = GL_KEEP; + state.stencil.back.action_depth_pass = GL_KEEP; + } +} - state.stencil.back.test_func = MaxwellToGL::ComparisonOp(regs.stencil_back_func_func); - state.stencil.back.test_ref = regs.stencil_back_func_ref; - state.stencil.back.test_mask = regs.stencil_back_func_mask; - state.stencil.back.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_fail); - state.stencil.back.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_zfail); - state.stencil.back.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_back_op_zpass); - state.stencil.back.write_mask = regs.stencil_back_mask; +void RasterizerOpenGL::SyncColorMask() { + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + const std::size_t count = + regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; + for (std::size_t i = 0; i < count; i++) { + const auto& source = regs.color_mask[regs.color_mask_common ? 0 : i]; + auto& dest = state.color_mask[i]; + dest.red_enabled = (source.R == 0) ? GL_FALSE : GL_TRUE; + dest.green_enabled = (source.G == 0) ? GL_FALSE : GL_TRUE; + dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; + dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; + } } -void RasterizerOpenGL::SyncBlendState() { +void RasterizerOpenGL::SyncMultiSampleState() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + state.multisample_control.alpha_to_coverage = regs.multisample_control.alpha_to_coverage != 0; + state.multisample_control.alpha_to_one = regs.multisample_control.alpha_to_one != 0; +} - // TODO(Subv): Support more than just render target 0. 
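SyncColorMask() above keeps one write mask per render target instead of a single global mask. Applying such state maps naturally onto the indexed glColorMaski entry point (core since OpenGL 3.0); the apply step itself lives in gl_state.cpp and is not part of this hunk, so the following is only a sketch:

    // Illustrative sketch: one color write mask per draw buffer.
    // Assumes a current GL 3.0+ context.
    #include <array>
    #include <glad/glad.h>

    struct ColorMaskEntry {
        GLboolean red_enabled = GL_TRUE;
        GLboolean green_enabled = GL_TRUE;
        GLboolean blue_enabled = GL_TRUE;
        GLboolean alpha_enabled = GL_TRUE;
    };

    void ApplyColorMasks(const std::array<ColorMaskEntry, 8>& masks) {
        for (GLuint index = 0; index < masks.size(); ++index) {
            const auto& mask = masks[index];
            glColorMaski(index, mask.red_enabled, mask.green_enabled, mask.blue_enabled,
                         mask.alpha_enabled);
        }
    }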
- state.blend.enabled = regs.blend.enable[0] != 0; +void RasterizerOpenGL::SyncFragmentColorClampState() { + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + state.fragment_color_clamp.enabled = regs.frag_color_clamp != 0; +} - if (!state.blend.enabled) - return; +void RasterizerOpenGL::SyncBlendState() { + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - ASSERT_MSG(regs.logic_op.enable == 0, - "Blending and logic op can't be enabled at the same time."); + state.blend_color.red = regs.blend_color.r; + state.blend_color.green = regs.blend_color.g; + state.blend_color.blue = regs.blend_color.b; + state.blend_color.alpha = regs.blend_color.a; + + state.independant_blend.enabled = regs.independent_blend_enable; + if (!state.independant_blend.enabled) { + auto& blend = state.blend[0]; + const auto& src = regs.blend; + blend.enabled = src.enable[0] != 0; + if (blend.enabled) { + blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); + blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); + blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); + blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); + blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); + blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); + } + for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { + state.blend[i].enabled = false; + } + return; + } - ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented"); - ASSERT_MSG(!regs.independent_blend[0].separate_alpha, "Unimplemented"); - state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_rgb); - state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_rgb); - state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_rgb); - state.blend.a_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_a); - state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_a); - state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_dest_a); + for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { + auto& blend = state.blend[i]; + const auto& src = regs.independent_blend[i]; + blend.enabled = regs.blend.enable[i] != 0; + if (!blend.enabled) + continue; + blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); + blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); + blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); + blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); + blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); + blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); + } } void RasterizerOpenGL::SyncLogicOpState() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - // TODO(Subv): Support more than just render target 0. 
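The SyncBlendState() rewrite above drops the old render-target-0-only asserts and mirrors Maxwell's independent blending into one blend entry per attachment. On the host side this relies on the indexed blend entry points available since OpenGL 4.0; a minimal sketch under that assumption (the function below is illustrative, not part of the patch):

    // Illustrative sketch: per-render-target ("independent") blending.
    // Assumes a current OpenGL 4.0+ context.
    #include <glad/glad.h>

    void ApplyBlendForTarget(GLuint index, bool enabled, GLenum rgb_equation, GLenum a_equation,
                             GLenum src_rgb, GLenum dst_rgb, GLenum src_a, GLenum dst_a) {
        if (!enabled) {
            glDisablei(GL_BLEND, index);
            return;
        }
        glEnablei(GL_BLEND, index);
        glBlendEquationSeparatei(index, rgb_equation, a_equation);
        glBlendFuncSeparatei(index, src_rgb, dst_rgb, src_a, dst_a);
    }

    // Example usage: additive blending on attachment 1 only.
    // ApplyBlendForTarget(1, true, GL_FUNC_ADD, GL_FUNC_ADD, GL_ONE, GL_ONE, GL_ONE, GL_ONE);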
state.logic_op.enabled = regs.logic_op.enable != 0; if (!state.logic_op.enabled) @@ -1027,19 +1234,25 @@ void RasterizerOpenGL::SyncLogicOpState() { state.logic_op.operation = MaxwellToGL::LogicOp(regs.logic_op.operation); } -void RasterizerOpenGL::SyncScissorTest() { +void RasterizerOpenGL::SyncScissorTest(OpenGLState& current_state) { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - - state.scissor.enabled = (regs.scissor_test.enable != 0); - // TODO(Blinkhawk): Figure if the hardware supports scissor testing per viewport and how it's - // implemented. - if (regs.scissor_test.enable != 0) { - const u32 width = regs.scissor_test.max_x - regs.scissor_test.min_x; - const u32 height = regs.scissor_test.max_y - regs.scissor_test.min_y; - state.scissor.x = regs.scissor_test.min_x; - state.scissor.y = regs.scissor_test.min_y; - state.scissor.width = width; - state.scissor.height = height; + const bool geometry_shaders_enabled = + regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); + const std::size_t viewport_count = + geometry_shaders_enabled ? Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; + for (std::size_t i = 0; i < viewport_count; i++) { + const auto& src = regs.scissor_test[i]; + auto& dst = current_state.viewports[i].scissor; + dst.enabled = (src.enable != 0); + if (dst.enabled == 0) { + return; + } + const u32 width = src.max_x - src.min_x; + const u32 height = src.max_y - src.min_y; + dst.x = src.min_x; + dst.y = src.min_y; + dst.width = width; + dst.height = height; } } @@ -1054,20 +1267,25 @@ void RasterizerOpenGL::SyncTransformFeedback() { void RasterizerOpenGL::SyncPointState() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + state.point.size = regs.point_size; +} - // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a - // register carrying a default value. For now, if the point size is zero, assume it's - // OpenGL's default (1). - state.point.size = regs.point_size == 0 ? 
1 : regs.point_size; +void RasterizerOpenGL::SyncPolygonOffset() { + const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; + state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; + state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; + state.polygon_offset.units = regs.polygon_offset_units; + state.polygon_offset.factor = regs.polygon_offset_factor; + state.polygon_offset.clamp = regs.polygon_offset_clamp; } void RasterizerOpenGL::CheckAlphaTests() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; if (regs.alpha_test_enabled != 0 && regs.rt_control.count > 1) { - LOG_CRITICAL( - Render_OpenGL, - "Alpha Testing is enabled with Multiple Render Targets, this behavior is undefined."); + LOG_CRITICAL(Render_OpenGL, "Alpha Testing is enabled with Multiple Render Targets, " + "this behavior is undefined."); UNREACHABLE(); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 5020a5392..8a891ffc7 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -8,12 +8,12 @@ #include <cstddef> #include <map> #include <memory> +#include <optional> #include <tuple> #include <utility> #include <vector> #include <boost/icl/interval_map.hpp> -#include <boost/optional.hpp> #include <boost/range/iterator_range.hpp> #include <glad/glad.h> @@ -40,6 +40,7 @@ namespace OpenGL { struct ScreenInfo; struct DrawParameters; +struct FramebufferCacheKey; class RasterizerOpenGL : public VideoCore::RasterizerInterface { public: @@ -60,20 +61,6 @@ public: bool AccelerateDrawBatch(bool is_indexed) override; void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) override; - /// OpenGL shader generated for a given Maxwell register state - struct MaxwellShader { - /// OpenGL shader resource - OGLProgram shader; - }; - - struct VertexShader { - OGLShader shader; - }; - - struct FragmentShader { - OGLShader shader; - }; - /// Maximum supported size that a constbuffer can have in bytes. static constexpr std::size_t MaxConstbufferSize = 0x10000; static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, @@ -88,17 +75,23 @@ private: /// SamplerInfo struct. void Create(); /// Syncs the sampler object with the config, updating any necessary state. 
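SyncPolygonOffset() above now also captures the polygon offset clamp value from the guest registers. Expressing that clamp in OpenGL needs GL 4.6 or the ARB/EXT polygon_offset_clamp extensions; a hedged sketch of how such state could be applied, assuming the glad loader was generated with those symbols, and falling back to the unclamped call otherwise:

    // Illustrative sketch: fill-mode polygon offset with optional clamp.
    // Availability of the clamped entry points depends on the glad profile in use.
    #include <glad/glad.h>

    void ApplyPolygonOffsetFill(bool enable, GLfloat factor, GLfloat units, GLfloat clamp) {
        if (!enable) {
            glDisable(GL_POLYGON_OFFSET_FILL);
            return;
        }
        glEnable(GL_POLYGON_OFFSET_FILL);
        if (GLAD_GL_VERSION_4_6 || GLAD_GL_ARB_polygon_offset_clamp) {
            glPolygonOffsetClamp(factor, units, clamp);
        } else if (GLAD_GL_EXT_polygon_offset_clamp) {
            glPolygonOffsetClampEXT(factor, units, clamp);
        } else {
            glPolygonOffset(factor, units); // clamp value is silently dropped
        }
    }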
- void SyncWithConfig(const Tegra::Texture::TSCEntry& config); + void SyncWithConfig(const Tegra::Texture::TSCEntry& info); private: - Tegra::Texture::TextureFilter mag_filter; - Tegra::Texture::TextureFilter min_filter; - Tegra::Texture::WrapMode wrap_u; - Tegra::Texture::WrapMode wrap_v; - Tegra::Texture::WrapMode wrap_p; - bool uses_depth_compare; - Tegra::Texture::DepthCompareFunc depth_compare_func; - GLvec4 border_color; + Tegra::Texture::TextureFilter mag_filter = Tegra::Texture::TextureFilter::Nearest; + Tegra::Texture::TextureFilter min_filter = Tegra::Texture::TextureFilter::Nearest; + Tegra::Texture::TextureMipmapFilter mip_filter = Tegra::Texture::TextureMipmapFilter::None; + Tegra::Texture::WrapMode wrap_u = Tegra::Texture::WrapMode::ClampToEdge; + Tegra::Texture::WrapMode wrap_v = Tegra::Texture::WrapMode::ClampToEdge; + Tegra::Texture::WrapMode wrap_p = Tegra::Texture::WrapMode::ClampToEdge; + bool uses_depth_compare = false; + Tegra::Texture::DepthCompareFunc depth_compare_func = + Tegra::Texture::DepthCompareFunc::Always; + GLvec4 border_color = {}; + float min_lod = 0.0f; + float max_lod = 16.0f; + float lod_bias = 0.0f; + float max_anisotropic = 1.0f; }; /** @@ -108,9 +101,9 @@ private: * @param preserve_contents If true, tries to preserve data from a previously used framebuffer. * @param single_color_target Specifies if a single color buffer target should be used. */ - void ConfigureFramebuffers(bool use_color_fb = true, bool using_depth_fb = true, - bool preserve_contents = true, - boost::optional<std::size_t> single_color_target = {}); + void ConfigureFramebuffers(OpenGLState& current_state, bool use_color_fb = true, + bool using_depth_fb = true, bool preserve_contents = true, + std::optional<std::size_t> single_color_target = {}); /* * Configures the current constbuffers to use for the draw command. @@ -132,11 +125,12 @@ private: u32 SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader, GLenum primitive_mode, u32 current_unit); - /// Syncs the viewport to match the guest state - void SyncViewport(); + /// Syncs the viewport and depth range to match the guest state + void SyncViewport(OpenGLState& current_state); /// Syncs the clip enabled status to match the guest state - void SyncClipEnabled(); + void SyncClipEnabled( + const std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances>& clip_mask); /// Syncs the clip coefficients to match the guest state void SyncClipCoef(); @@ -147,9 +141,6 @@ private: /// Syncs the primitve restart to match the guest state void SyncPrimitiveRestart(); - /// Syncs the depth range to match the guest state - void SyncDepthRange(); - /// Syncs the depth test state to match the guest state void SyncDepthTestState(); @@ -162,8 +153,14 @@ private: /// Syncs the LogicOp state to match the guest state void SyncLogicOpState(); + /// Syncs the the color clamp state + void SyncFragmentColorClampState(); + + /// Syncs the alpha coverage and alpha to one + void SyncMultiSampleState(); + /// Syncs the scissor test state to match the guest state - void SyncScissorTest(); + void SyncScissorTest(OpenGLState& current_state); /// Syncs the transform feedback state to match the guest state void SyncTransformFeedback(); @@ -171,13 +168,18 @@ private: /// Syncs the point state to match the guest state void SyncPointState(); + /// Syncs Color Mask + void SyncColorMask(); + + /// Syncs the polygon offsets + void SyncPolygonOffset(); + /// Check asserts for alpha testing. 
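The SyncClipEnabled() declaration above takes a per-shader clip mask and, as the implementation earlier in this diff shows, combines it with the clip_distance_enabled register bits into state.clip_distance. Each enabled slot ultimately corresponds to one GL_CLIP_DISTANCE on the host; a small illustrative sketch (not part of the patch):

    // Illustrative sketch: enable or disable host clip distances to match a guest mask.
    // OpenGL guarantees at least 8 clip distances (GL_MAX_CLIP_DISTANCES >= 8).
    #include <array>
    #include <glad/glad.h>

    void ApplyClipDistances(const std::array<bool, 8>& clip_distance) {
        for (GLenum i = 0; i < clip_distance.size(); ++i) {
            if (clip_distance[i]) {
                glEnable(GL_CLIP_DISTANCE0 + i);
            } else {
                glDisable(GL_CLIP_DISTANCE0 + i);
            }
        }
    }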
void CheckAlphaTests(); - bool has_ARB_direct_state_access = false; - bool has_ARB_multi_bind = false; - bool has_ARB_separate_shader_objects = false; - bool has_ARB_vertex_attrib_binding = false; + /// Check for extension that are not strictly required + /// but are needed for correct emulation + void CheckExtensions(); OpenGLState state; @@ -194,11 +196,12 @@ private: OGLVertexArray> vertex_array_cache; + std::map<FramebufferCacheKey, OGLFramebuffer> framebuffer_cache; + std::array<SamplerInfo, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> texture_samplers; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; - OGLFramebuffer framebuffer; PrimitiveAssembler primitive_assembler{buffer_cache}; GLint uniform_buffer_alignment; @@ -206,12 +209,15 @@ private: std::size_t CalculateIndexBufferSize() const; - void SetupVertexArrays(); + void SetupVertexFormat(); + void SetupVertexBuffer(); DrawParameters SetupDraw(); void SetupShaders(GLenum primitive_mode); + void SetupCachedFramebuffer(const FramebufferCacheKey& fbkey, OpenGLState& current_state); + enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index c3cf33eed..d3dcb9a46 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -15,16 +15,26 @@ #include "core/memory.h" #include "core/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/morton.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" +#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/utils.h" +#include "video_core/surface.h" #include "video_core/textures/astc.h" #include "video_core/textures/decoders.h" -#include "video_core/utils.h" namespace OpenGL { -using SurfaceType = SurfaceParams::SurfaceType; -using PixelFormat = SurfaceParams::PixelFormat; -using ComponentType = SurfaceParams::ComponentType; +using VideoCore::MortonSwizzle; +using VideoCore::MortonSwizzleMode; +using VideoCore::Surface::ComponentTypeFromDepthFormat; +using VideoCore::Surface::ComponentTypeFromRenderTarget; +using VideoCore::Surface::ComponentTypeFromTexture; +using VideoCore::Surface::PixelFormatFromDepthFormat; +using VideoCore::Surface::PixelFormatFromRenderTargetFormat; +using VideoCore::Surface::PixelFormatFromTextureFormat; +using VideoCore::Surface::SurfaceTargetFromTextureType; struct FormatTuple { GLint internal_format; @@ -34,43 +44,14 @@ struct FormatTuple { bool compressed; }; -static bool IsPixelFormatASTC(PixelFormat format) { - switch (format) { - case PixelFormat::ASTC_2D_4X4: - case PixelFormat::ASTC_2D_5X4: - case PixelFormat::ASTC_2D_8X8: - case PixelFormat::ASTC_2D_8X5: - case PixelFormat::ASTC_2D_4X4_SRGB: - case PixelFormat::ASTC_2D_5X4_SRGB: - case PixelFormat::ASTC_2D_8X8_SRGB: - case PixelFormat::ASTC_2D_8X5_SRGB: - return true; - default: - return false; - } -} - -static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { - switch (format) { - case PixelFormat::ASTC_2D_4X4: - return {4, 4}; - case PixelFormat::ASTC_2D_5X4: - return {5, 4}; - case PixelFormat::ASTC_2D_8X8: - return {8, 8}; - case PixelFormat::ASTC_2D_8X5: - return {8, 5}; - case PixelFormat::ASTC_2D_4X4_SRGB: - return {4, 4}; - case PixelFormat::ASTC_2D_5X4_SRGB: - return {5, 4}; - case 
PixelFormat::ASTC_2D_8X8_SRGB: - return {8, 8}; - case PixelFormat::ASTC_2D_8X5_SRGB: - return {8, 5}; - default: - LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format)); - UNREACHABLE(); +static void ApplyTextureDefaults(GLenum target, u32 max_mip_level) { + glTexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, max_mip_level - 1); + if (max_mip_level == 1) { + glTexParameterf(target, GL_TEXTURE_LOD_BIAS, 1000.0); } } @@ -90,27 +71,34 @@ void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) { } } -std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { - const u32 compression_factor{GetCompressionFactor(pixel_format)}; +std::size_t SurfaceParams::InnerMipmapMemorySize(u32 mip_level, bool force_gl, bool layer_only, + bool uncompressed) const { + const u32 tile_x{GetDefaultBlockWidth(pixel_format)}; + const u32 tile_y{GetDefaultBlockHeight(pixel_format)}; const u32 bytes_per_pixel{GetBytesPerPixel(pixel_format)}; u32 m_depth = (layer_only ? 1U : depth); - u32 m_width = std::max(1U, width / compression_factor); - u32 m_height = std::max(1U, height / compression_factor); - std::size_t size = Tegra::Texture::CalculateSize(is_tiled, bytes_per_pixel, m_width, m_height, - m_depth, block_height, block_depth); - u32 m_block_height = block_height; - u32 m_block_depth = block_depth; - std::size_t block_size_bytes = 512 * block_height * block_depth; // 512 is GOB size - for (u32 i = 1; i < max_mip_level; i++) { - m_width = std::max(1U, m_width / 2); - m_height = std::max(1U, m_height / 2); - m_depth = std::max(1U, m_depth / 2); - m_block_height = std::max(1U, m_block_height / 2); - m_block_depth = std::max(1U, m_block_depth / 2); - size += Tegra::Texture::CalculateSize(is_tiled, bytes_per_pixel, m_width, m_height, m_depth, - m_block_height, m_block_depth); + u32 m_width = MipWidth(mip_level); + u32 m_height = MipHeight(mip_level); + m_width = uncompressed ? m_width : std::max(1U, (m_width + tile_x - 1) / tile_x); + m_height = uncompressed ? m_height : std::max(1U, (m_height + tile_y - 1) / tile_y); + m_depth = std::max(1U, m_depth >> mip_level); + u32 m_block_height = MipBlockHeight(mip_level); + u32 m_block_depth = MipBlockDepth(mip_level); + return Tegra::Texture::CalculateSize(force_gl ? false : is_tiled, bytes_per_pixel, m_width, + m_height, m_depth, m_block_height, m_block_depth); +} + +std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only, + bool uncompressed) const { + std::size_t block_size_bytes = Tegra::Texture::GetGOBSize() * block_height * block_depth; + std::size_t size = 0; + for (u32 i = 0; i < max_mip_level; i++) { + size += InnerMipmapMemorySize(i, force_gl, layer_only, uncompressed); } - return is_tiled ? Common::AlignUp(size, block_size_bytes) : size; + if (!force_gl && is_tiled) { + size = Common::AlignUp(size, block_size_bytes); + } + return size; } /*static*/ SurfaceParams SurfaceParams::CreateForTexture( @@ -120,11 +108,22 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0, params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0, params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0, + params.tile_width_spacing = params.is_tiled ? 
(1 << config.tic.tile_width_spacing.Value()) : 1; params.srgb_conversion = config.tic.IsSrgbConversionEnabled(); params.pixel_format = PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value(), params.srgb_conversion); + + if (params.pixel_format == PixelFormat::R16U && config.tsc.depth_compare_enabled) { + // Some titles create a 'R16U' (normalized 16-bit) texture with depth_compare enabled, + // then attempt to sample from it via a shadow sampler. Convert format to Z16 (which also + // causes GetFormatType to properly return 'Depth' below). + params.pixel_format = PixelFormat::Z16; + } + params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value()); params.type = GetFormatType(params.pixel_format); + UNIMPLEMENTED_IF(params.type == SurfaceType::ColorTexture && config.tsc.depth_compare_enabled); + params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format)); params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format)); params.unaligned_height = config.tic.Height(); @@ -153,6 +152,13 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.target = SurfaceTarget::Texture2D; } break; + case SurfaceTarget::TextureCubeArray: + params.depth = config.tic.Depth() * 6; + if (!entry.IsArray()) { + ASSERT(params.depth == 6); + params.target = SurfaceTarget::TextureCubemap; + } + break; default: LOG_CRITICAL(HW_GPU, "Unknown depth for target={}", static_cast<u32>(params.target)); UNREACHABLE(); @@ -178,6 +184,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.block_width = 1 << config.memory_layout.block_width; params.block_height = 1 << config.memory_layout.block_height; params.block_depth = 1 << config.memory_layout.block_depth; + params.tile_width_spacing = 1; params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; @@ -188,7 +195,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.max_mip_level = 0; + params.max_mip_level = 1; params.is_layered = false; // Render target specific parameters, not used for caching @@ -213,6 +220,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.block_width = 1 << std::min(block_width, 5U); params.block_height = 1 << std::min(block_height, 5U); params.block_depth = 1 << std::min(block_depth, 5U); + params.tile_width_spacing = 1; params.pixel_format = PixelFormatFromDepthFormat(format); params.component_type = ComponentTypeFromDepthFormat(format); params.type = GetFormatType(params.pixel_format); @@ -222,7 +230,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.unaligned_height = zeta_height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.max_mip_level = 0; + params.max_mip_level = 1; params.is_layered = false; params.rt = {}; @@ -239,6 +247,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0, params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0, params.block_depth = params.is_tiled ? 
std::min(config.BlockDepth(), 32U) : 0, + params.tile_width_spacing = 1; params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; @@ -249,7 +258,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; params.depth = 1; - params.max_mip_level = 0; + params.max_mip_level = 1; params.rt = {}; params.InitCacheParameters(config.Address()); @@ -257,7 +266,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool layer_only) const { return params; } -static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{ +static constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false}, // ABGR8S {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // ABGR8UI @@ -269,7 +278,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // R8UI {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false}, // RGBA16F {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UNorm, false}, // RGBA16U - {GL_RGBA16UI, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RGBA16UI + {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RGBA16UI {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float, false}, // R11FG11FB10F {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI @@ -283,15 +292,13 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN2UNORM {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM - {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + {GL_COMPRESSED_RGBA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // BC7U - {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, - ComponentType::Float, true}, // BC6H_UF16 - {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, + {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, + true}, // BC6H_UF16 + {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::Float, true}, // BC6H_SF16 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4 - {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // G8R8U - {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false}, // G8R8S {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // BGRA8 {GL_RGBA32F, GL_RGBA, GL_FLOAT, ComponentType::Float, false}, // RGBA32F {GL_RG32F, GL_RG, GL_FLOAT, ComponentType::Float, false}, // RG32F @@ -324,12 +331,16 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form true}, // DXT23_SRGB {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXT45_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM_ARB, GL_RGBA, 
GL_UNSIGNED_INT_8_8_8_8, - ComponentType::UNorm, true}, // BC7U_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, + true}, // BC7U_SRGB {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4_SRGB {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8_SRGB {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X5_SRGB {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X4_SRGB + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5 + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_5X5_SRGB + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8 + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_10X8_SRGB // Depth formats {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F @@ -345,20 +356,22 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form ComponentType::Float, false}, // Z32FS8 }}; -static GLenum SurfaceTargetToGL(SurfaceParams::SurfaceTarget target) { +static GLenum SurfaceTargetToGL(SurfaceTarget target) { switch (target) { - case SurfaceParams::SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1D: return GL_TEXTURE_1D; - case SurfaceParams::SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2D: return GL_TEXTURE_2D; - case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceTarget::Texture3D: return GL_TEXTURE_3D; - case SurfaceParams::SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture1DArray: return GL_TEXTURE_1D_ARRAY; - case SurfaceParams::SurfaceTarget::Texture2DArray: + case SurfaceTarget::Texture2DArray: return GL_TEXTURE_2D_ARRAY; - case SurfaceParams::SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubemap: return GL_TEXTURE_CUBE_MAP; + case SurfaceTarget::TextureCubeArray: + return GL_TEXTURE_CUBE_MAP_ARRAY; } LOG_CRITICAL(Render_OpenGL, "Unimplemented texture target={}", static_cast<u32>(target)); UNREACHABLE(); @@ -373,351 +386,44 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType return format; } -MathUtil::Rectangle<u32> SurfaceParams::GetRect() const { - u32 actual_height{unaligned_height}; +MathUtil::Rectangle<u32> SurfaceParams::GetRect(u32 mip_level) const { + u32 actual_height{std::max(1U, unaligned_height >> mip_level)}; if (IsPixelFormatASTC(pixel_format)) { // ASTC formats must stop at the ATSC block size boundary actual_height = Common::AlignDown(actual_height, GetASTCBlockSize(pixel_format).second); } - return {0, actual_height, width, 0}; -} - -/// Returns true if the specified PixelFormat is a BCn format, e.g. 
DXT or DXN -static bool IsFormatBCn(PixelFormat format) { - switch (format) { - case PixelFormat::DXT1: - case PixelFormat::DXT23: - case PixelFormat::DXT45: - case PixelFormat::DXN1: - case PixelFormat::DXN2SNORM: - case PixelFormat::DXN2UNORM: - case PixelFormat::BC7U: - case PixelFormat::BC6H_UF16: - case PixelFormat::BC6H_SF16: - case PixelFormat::DXT1_SRGB: - case PixelFormat::DXT23_SRGB: - case PixelFormat::DXT45_SRGB: - case PixelFormat::BC7U_SRGB: - return true; - } - return false; -} - -template <bool morton_to_gl, PixelFormat format> -void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth, u8* gl_buffer, - std::size_t gl_buffer_size, VAddr addr) { - constexpr u32 bytes_per_pixel = SurfaceParams::GetBytesPerPixel(format); - - // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual - // pixel values. - const u32 tile_size{IsFormatBCn(format) ? 4U : 1U}; - - if (morton_to_gl) { - const std::vector<u8> data = Tegra::Texture::UnswizzleTexture( - addr, tile_size, bytes_per_pixel, stride, height, depth, block_height, block_depth); - const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())}; - memcpy(gl_buffer, data.data(), size_to_copy); - } else { - Tegra::Texture::CopySwizzledData(stride / tile_size, height / tile_size, depth, - bytes_per_pixel, bytes_per_pixel, Memory::GetPointer(addr), - gl_buffer, false, block_height, block_depth); - } + return {0, actual_height, MipWidth(mip_level), 0}; } -using GLConversionArray = std::array<void (*)(u32, u32, u32, u32, u32, u8*, std::size_t, VAddr), - SurfaceParams::MaxPixelFormat>; - -static constexpr GLConversionArray morton_to_gl_fns = { - // clang-format off - MortonCopy<true, PixelFormat::ABGR8U>, - MortonCopy<true, PixelFormat::ABGR8S>, - MortonCopy<true, PixelFormat::ABGR8UI>, - MortonCopy<true, PixelFormat::B5G6R5U>, - MortonCopy<true, PixelFormat::A2B10G10R10U>, - MortonCopy<true, PixelFormat::A1B5G5R5U>, - MortonCopy<true, PixelFormat::R8U>, - MortonCopy<true, PixelFormat::R8UI>, - MortonCopy<true, PixelFormat::RGBA16F>, - MortonCopy<true, PixelFormat::RGBA16U>, - MortonCopy<true, PixelFormat::RGBA16UI>, - MortonCopy<true, PixelFormat::R11FG11FB10F>, - MortonCopy<true, PixelFormat::RGBA32UI>, - MortonCopy<true, PixelFormat::DXT1>, - MortonCopy<true, PixelFormat::DXT23>, - MortonCopy<true, PixelFormat::DXT45>, - MortonCopy<true, PixelFormat::DXN1>, - MortonCopy<true, PixelFormat::DXN2UNORM>, - MortonCopy<true, PixelFormat::DXN2SNORM>, - MortonCopy<true, PixelFormat::BC7U>, - MortonCopy<true, PixelFormat::BC6H_UF16>, - MortonCopy<true, PixelFormat::BC6H_SF16>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4>, - MortonCopy<true, PixelFormat::G8R8U>, - MortonCopy<true, PixelFormat::G8R8S>, - MortonCopy<true, PixelFormat::BGRA8>, - MortonCopy<true, PixelFormat::RGBA32F>, - MortonCopy<true, PixelFormat::RG32F>, - MortonCopy<true, PixelFormat::R32F>, - MortonCopy<true, PixelFormat::R16F>, - MortonCopy<true, PixelFormat::R16U>, - MortonCopy<true, PixelFormat::R16S>, - MortonCopy<true, PixelFormat::R16UI>, - MortonCopy<true, PixelFormat::R16I>, - MortonCopy<true, PixelFormat::RG16>, - MortonCopy<true, PixelFormat::RG16F>, - MortonCopy<true, PixelFormat::RG16UI>, - MortonCopy<true, PixelFormat::RG16I>, - MortonCopy<true, PixelFormat::RG16S>, - MortonCopy<true, PixelFormat::RGB32F>, - MortonCopy<true, PixelFormat::RGBA8_SRGB>, - MortonCopy<true, PixelFormat::RG8U>, - MortonCopy<true, PixelFormat::RG8S>, - MortonCopy<true, PixelFormat::RG32UI>, - MortonCopy<true, 
PixelFormat::R32UI>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4>, - MortonCopy<true, PixelFormat::BGRA8_SRGB>, - MortonCopy<true, PixelFormat::DXT1_SRGB>, - MortonCopy<true, PixelFormat::DXT23_SRGB>, - MortonCopy<true, PixelFormat::DXT45_SRGB>, - MortonCopy<true, PixelFormat::BC7U_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>, - MortonCopy<true, PixelFormat::Z32F>, - MortonCopy<true, PixelFormat::Z16>, - MortonCopy<true, PixelFormat::Z24S8>, - MortonCopy<true, PixelFormat::S8Z24>, - MortonCopy<true, PixelFormat::Z32FS8>, - // clang-format on -}; - -static constexpr GLConversionArray gl_to_morton_fns = { - // clang-format off - MortonCopy<false, PixelFormat::ABGR8U>, - MortonCopy<false, PixelFormat::ABGR8S>, - MortonCopy<false, PixelFormat::ABGR8UI>, - MortonCopy<false, PixelFormat::B5G6R5U>, - MortonCopy<false, PixelFormat::A2B10G10R10U>, - MortonCopy<false, PixelFormat::A1B5G5R5U>, - MortonCopy<false, PixelFormat::R8U>, - MortonCopy<false, PixelFormat::R8UI>, - MortonCopy<false, PixelFormat::RGBA16F>, - MortonCopy<false, PixelFormat::RGBA16U>, - MortonCopy<false, PixelFormat::RGBA16UI>, - MortonCopy<false, PixelFormat::R11FG11FB10F>, - MortonCopy<false, PixelFormat::RGBA32UI>, - MortonCopy<false, PixelFormat::DXT1>, - MortonCopy<false, PixelFormat::DXT23>, - MortonCopy<false, PixelFormat::DXT45>, - MortonCopy<false, PixelFormat::DXN1>, - MortonCopy<false, PixelFormat::DXN2UNORM>, - MortonCopy<false, PixelFormat::DXN2SNORM>, - MortonCopy<false, PixelFormat::BC7U>, - MortonCopy<false, PixelFormat::BC6H_UF16>, - MortonCopy<false, PixelFormat::BC6H_SF16>, - // TODO(Subv): Swizzling ASTC formats are not supported - nullptr, - MortonCopy<false, PixelFormat::G8R8U>, - MortonCopy<false, PixelFormat::G8R8S>, - MortonCopy<false, PixelFormat::BGRA8>, - MortonCopy<false, PixelFormat::RGBA32F>, - MortonCopy<false, PixelFormat::RG32F>, - MortonCopy<false, PixelFormat::R32F>, - MortonCopy<false, PixelFormat::R16F>, - MortonCopy<false, PixelFormat::R16U>, - MortonCopy<false, PixelFormat::R16S>, - MortonCopy<false, PixelFormat::R16UI>, - MortonCopy<false, PixelFormat::R16I>, - MortonCopy<false, PixelFormat::RG16>, - MortonCopy<false, PixelFormat::RG16F>, - MortonCopy<false, PixelFormat::RG16UI>, - MortonCopy<false, PixelFormat::RG16I>, - MortonCopy<false, PixelFormat::RG16S>, - MortonCopy<false, PixelFormat::RGB32F>, - MortonCopy<false, PixelFormat::RGBA8_SRGB>, - MortonCopy<false, PixelFormat::RG8U>, - MortonCopy<false, PixelFormat::RG8S>, - MortonCopy<false, PixelFormat::RG32UI>, - MortonCopy<false, PixelFormat::R32UI>, - nullptr, - nullptr, - nullptr, - MortonCopy<false, PixelFormat::BGRA8_SRGB>, - MortonCopy<false, PixelFormat::DXT1_SRGB>, - MortonCopy<false, PixelFormat::DXT23_SRGB>, - MortonCopy<false, PixelFormat::DXT45_SRGB>, - MortonCopy<false, PixelFormat::BC7U_SRGB>, - nullptr, - nullptr, - nullptr, - nullptr, - MortonCopy<false, PixelFormat::Z32F>, - MortonCopy<false, PixelFormat::Z16>, - MortonCopy<false, PixelFormat::Z24S8>, - MortonCopy<false, PixelFormat::S8Z24>, - MortonCopy<false, PixelFormat::Z32FS8>, - // clang-format on -}; - -void SwizzleFunc(const GLConversionArray& functions, const SurfaceParams& params, - std::vector<u8>& gl_buffer) { - u32 depth = params.depth; - if (params.target == SurfaceParams::SurfaceTarget::Texture2D) { 
+void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params, + std::vector<u8>& gl_buffer, u32 mip_level) { + u32 depth = params.MipDepth(mip_level); + if (params.target == SurfaceTarget::Texture2D) { // TODO(Blinkhawk): Eliminate this condition once all texture types are implemented. depth = 1U; } if (params.is_layered) { - u64 offset = 0; + u64 offset = params.GetMipmapLevelOffset(mip_level); u64 offset_gl = 0; - u64 layer_size = params.LayerMemorySize(); - u64 gl_size = params.LayerSizeGL(); - for (u32 i = 0; i < depth; i++) { - functions[static_cast<std::size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, params.block_depth, 1, - gl_buffer.data() + offset_gl, gl_size, params.addr + offset); + const u64 layer_size = params.LayerMemorySize(); + const u64 gl_size = params.LayerSizeGL(mip_level); + for (u32 i = 0; i < params.depth; i++) { + MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level), + params.MipBlockHeight(mip_level), params.MipHeight(mip_level), + params.MipBlockDepth(mip_level), params.tile_width_spacing, 1, + gl_buffer.data() + offset_gl, gl_size, params.addr + offset); offset += layer_size; offset_gl += gl_size; } } else { - functions[static_cast<std::size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, params.block_depth, depth, - gl_buffer.data(), gl_buffer.size(), params.addr); + const u64 offset = params.GetMipmapLevelOffset(mip_level); + MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level), + params.MipBlockHeight(mip_level), params.MipHeight(mip_level), + params.MipBlockDepth(mip_level), depth, params.tile_width_spacing, + gl_buffer.data(), gl_buffer.size(), params.addr + offset); } } -static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface, - GLuint read_fb_handle, GLuint draw_fb_handle, GLenum src_attachment = 0, - GLenum dst_attachment = 0, std::size_t cubemap_face = 0) { - - const auto& src_params{src_surface->GetSurfaceParams()}; - const auto& dst_params{dst_surface->GetSurfaceParams()}; - - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ prev_state.Apply(); }); - - OpenGLState state; - state.draw.read_framebuffer = read_fb_handle; - state.draw.draw_framebuffer = draw_fb_handle; - // Set sRGB enabled if the destination surfaces need it - state.framebuffer_srgb.enabled = dst_params.srgb_conversion; - state.Apply(); - - u32 buffers{}; - - if (src_params.type == SurfaceType::ColorTexture) { - switch (src_params.target) { - case SurfaceParams::SurfaceTarget::Texture2D: - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - case SurfaceParams::SurfaceTarget::TextureCubemap: - glFramebufferTexture2D( - GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), - src_surface->Texture().handle, 0); - glFramebufferTexture2D( - GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0); - break; - case SurfaceParams::SurfaceTarget::Texture2DArray: - glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - src_surface->Texture().handle, 0, 0); - glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0); - break; - case 
SurfaceParams::SurfaceTarget::Texture3D: - glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - SurfaceTargetToGL(src_params.target), - src_surface->Texture().handle, 0, 0); - glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - SurfaceTargetToGL(src_params.target), 0, 0, 0); - break; - default: - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - } - - switch (dst_params.target) { - case SurfaceParams::SurfaceTarget::Texture2D: - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - case SurfaceParams::SurfaceTarget::TextureCubemap: - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), - dst_surface->Texture().handle, 0); - glFramebufferTexture2D( - GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0); - break; - case SurfaceParams::SurfaceTarget::Texture2DArray: - glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - dst_surface->Texture().handle, 0, 0); - glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0); - break; - - case SurfaceParams::SurfaceTarget::Texture3D: - glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - SurfaceTargetToGL(dst_params.target), - dst_surface->Texture().handle, 0, 0); - glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, - SurfaceTargetToGL(dst_params.target), 0, 0, 0); - break; - default: - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 0, 0); - break; - } - - buffers = GL_COLOR_BUFFER_BIT; - } else if (src_params.type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - src_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - dst_surface->Texture().handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - - buffers = GL_DEPTH_BUFFER_BIT; - } else if (src_params.type == SurfaceType::DepthStencil) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - src_surface->Texture().handle, 0); - - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment, - GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - dst_surface->Texture().handle, 0); - - buffers = 
GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; - } - - const auto& rect{src_params.GetRect()}; - glBlitFramebuffer(rect.left, rect.bottom, rect.right, rect.top, rect.left, rect.bottom, - rect.right, rect.top, buffers, - buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST); - - return true; -} - static void FastCopySurface(const Surface& src_surface, const Surface& dst_surface) { const auto& src_params{src_surface->GetSurfaceParams()}; const auto& dst_params{dst_surface->GetSurfaceParams()}; @@ -730,21 +436,23 @@ static void FastCopySurface(const Surface& src_surface, const Surface& dst_surfa 0, 0, width, height, 1); } +MICROPROFILE_DEFINE(OpenGL_CopySurface, "OpenGL", "CopySurface", MP_RGB(128, 192, 64)); static void CopySurface(const Surface& src_surface, const Surface& dst_surface, - GLuint copy_pbo_handle, GLenum src_attachment = 0, - GLenum dst_attachment = 0, std::size_t cubemap_face = 0) { + const GLuint copy_pbo_handle, const GLenum src_attachment = 0, + const GLenum dst_attachment = 0, const std::size_t cubemap_face = 0) { + MICROPROFILE_SCOPE(OpenGL_CopySurface); ASSERT_MSG(dst_attachment == 0, "Unimplemented"); const auto& src_params{src_surface->GetSurfaceParams()}; const auto& dst_params{dst_surface->GetSurfaceParams()}; - auto source_format = GetFormatTuple(src_params.pixel_format, src_params.component_type); - auto dest_format = GetFormatTuple(dst_params.pixel_format, dst_params.component_type); + const auto source_format = GetFormatTuple(src_params.pixel_format, src_params.component_type); + const auto dest_format = GetFormatTuple(dst_params.pixel_format, dst_params.component_type); - std::size_t buffer_size = std::max(src_params.size_in_bytes, dst_params.size_in_bytes); + const std::size_t buffer_size = std::max(src_params.size_in_bytes, dst_params.size_in_bytes); glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle); - glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW_ARB); + glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW); if (source_format.compressed) { glGetCompressedTextureImage(src_surface->Texture().handle, src_attachment, static_cast<GLsizei>(src_params.size_in_bytes), nullptr); @@ -765,13 +473,10 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface, LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during " "reinterpretation but the texture is tiled."); } - std::size_t remaining_size = dst_params.size_in_bytes - src_params.size_in_bytes; - std::vector<u8> data(remaining_size); - std::memcpy(data.data(), Memory::GetPointer(dst_params.addr + src_params.size_in_bytes), - data.size()); + const std::size_t remaining_size = dst_params.size_in_bytes - src_params.size_in_bytes; glBufferSubData(GL_PIXEL_PACK_BUFFER, src_params.size_in_bytes, remaining_size, - data.data()); + Memory::GetPointer(dst_params.addr + src_params.size_in_bytes)); } glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -787,21 +492,22 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface, UNREACHABLE(); } else { switch (dst_params.target) { - case SurfaceParams::SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1D: glTextureSubImage1D(dst_surface->Texture().handle, 0, 0, width, dest_format.format, dest_format.type, nullptr); break; - case SurfaceParams::SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2D: glTextureSubImage2D(dst_surface->Texture().handle, 0, 0, 0, width, height, dest_format.format, dest_format.type, nullptr); break; - case SurfaceParams::SurfaceTarget::Texture3D: 
- case SurfaceParams::SurfaceTarget::Texture2DArray: + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: glTextureSubImage3D(dst_surface->Texture().handle, 0, 0, 0, 0, width, height, static_cast<GLsizei>(dst_params.depth), dest_format.format, dest_format.type, nullptr); break; - case SurfaceParams::SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubemap: glTextureSubImage3D(dst_surface->Texture().handle, 0, 0, 0, static_cast<GLint>(cubemap_face), width, height, 1, dest_format.format, dest_format.type, nullptr); @@ -835,38 +541,41 @@ CachedSurface::CachedSurface(const SurfaceParams& params) glActiveTexture(GL_TEXTURE0); const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type); + gl_internal_format = format_tuple.internal_format; + gl_is_compressed = format_tuple.compressed; + if (!format_tuple.compressed) { // Only pre-create the texture for non-compressed textures. switch (params.target) { - case SurfaceParams::SurfaceTarget::Texture1D: - glTexStorage1D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, - rect.GetWidth()); + case SurfaceTarget::Texture1D: + glTexStorage1D(SurfaceTargetToGL(params.target), params.max_mip_level, + format_tuple.internal_format, rect.GetWidth()); break; - case SurfaceParams::SurfaceTarget::Texture2D: - case SurfaceParams::SurfaceTarget::TextureCubemap: - glTexStorage2D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, - rect.GetWidth(), rect.GetHeight()); + case SurfaceTarget::Texture2D: + case SurfaceTarget::TextureCubemap: + glTexStorage2D(SurfaceTargetToGL(params.target), params.max_mip_level, + format_tuple.internal_format, rect.GetWidth(), rect.GetHeight()); break; - case SurfaceParams::SurfaceTarget::Texture3D: - case SurfaceParams::SurfaceTarget::Texture2DArray: - glTexStorage3D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, - rect.GetWidth(), rect.GetHeight(), params.depth); + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glTexStorage3D(SurfaceTargetToGL(params.target), params.max_mip_level, + format_tuple.internal_format, rect.GetWidth(), rect.GetHeight(), + params.depth); break; default: LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", static_cast<u32>(params.target)); UNREACHABLE(); - glTexStorage2D(GL_TEXTURE_2D, 1, format_tuple.internal_format, rect.GetWidth(), - rect.GetHeight()); + glTexStorage2D(GL_TEXTURE_2D, params.max_mip_level, format_tuple.internal_format, + rect.GetWidth(), rect.GetHeight()); } } - glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + ApplyTextureDefaults(SurfaceTargetToGL(params.target), params.max_mip_level); - VideoCore::LabelGLObject(GL_TEXTURE, texture.handle, params.addr, - SurfaceParams::SurfaceTargetName(params.target)); + LabelGLObject(GL_TEXTURE, texture.handle, params.addr, + SurfaceParams::SurfaceTargetName(params.target)); // Clamp size to mapped GPU memory region // TODO(bunnei): Super Mario Odyssey maps a 0x40000 byte region and then uses it for a 0x80000 @@ -896,7 +605,7 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bo S8Z24 s8z24_pixel{}; Z24S8 z24s8_pixel{}; - constexpr auto bpp{SurfaceParams::GetBytesPerPixel(PixelFormat::S8Z24)}; + 
constexpr auto bpp{GetBytesPerPixel(PixelFormat::S8Z24)}; for (std::size_t y = 0; y < height; ++y) { for (std::size_t x = 0; x < width; ++x) { const std::size_t offset{bpp * (y * width + x)}; @@ -915,51 +624,38 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bo } } -static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) { - constexpr auto bpp{SurfaceParams::GetBytesPerPixel(PixelFormat::G8R8U)}; - for (std::size_t y = 0; y < height; ++y) { - for (std::size_t x = 0; x < width; ++x) { - const std::size_t offset{bpp * (y * width + x)}; - const u8 temp{data[offset]}; - data[offset] = data[offset + 1]; - data[offset + 1] = temp; - } - } -} - /** * Helper function to perform software conversion (as needed) when loading a buffer from Switch * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with * typical desktop GPUs. */ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format, - u32 width, u32 height) { + u32 width, u32 height, u32 depth) { switch (pixel_format) { case PixelFormat::ASTC_2D_4X4: case PixelFormat::ASTC_2D_8X8: case PixelFormat::ASTC_2D_8X5: case PixelFormat::ASTC_2D_5X4: + case PixelFormat::ASTC_2D_5X5: case PixelFormat::ASTC_2D_4X4_SRGB: case PixelFormat::ASTC_2D_8X8_SRGB: case PixelFormat::ASTC_2D_8X5_SRGB: - case PixelFormat::ASTC_2D_5X4_SRGB: { + case PixelFormat::ASTC_2D_5X4_SRGB: + case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X8: + case PixelFormat::ASTC_2D_10X8_SRGB: { // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC. u32 block_width{}; u32 block_height{}; std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format); - data = Tegra::Texture::ASTC::Decompress(data, width, height, block_width, block_height); + data = + Tegra::Texture::ASTC::Decompress(data, width, height, depth, block_width, block_height); break; } case PixelFormat::S8Z24: // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24. ConvertS8Z24ToZ24S8(data, width, height, false); break; - - case PixelFormat::G8R8U: - case PixelFormat::G8R8S: - // Convert the G8R8 color format to R8G8, as OpenGL does not support G8R8. 
- ConvertG8R8ToR8G8(data, width, height); - break; } } @@ -971,12 +667,14 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelFormat pixel_format, u32 width, u32 height) { switch (pixel_format) { - case PixelFormat::G8R8U: - case PixelFormat::G8R8S: case PixelFormat::ASTC_2D_4X4: case PixelFormat::ASTC_2D_8X8: case PixelFormat::ASTC_2D_4X4_SRGB: - case PixelFormat::ASTC_2D_8X8_SRGB: { + case PixelFormat::ASTC_2D_8X8_SRGB: + case PixelFormat::ASTC_2D_5X5: + case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X8: + case PixelFormat::ASTC_2D_10X8_SRGB: { LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented", static_cast<u32>(pixel_format)); UNREACHABLE(); @@ -989,23 +687,26 @@ static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelForm } } -MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192)); +MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64)); void CachedSurface::LoadGLBuffer() { MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); - - gl_buffer.resize(params.size_in_bytes_gl); + gl_buffer.resize(params.max_mip_level); + for (u32 i = 0; i < params.max_mip_level; i++) + gl_buffer[i].resize(params.GetMipmapSizeGL(i)); if (params.is_tiled) { ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", params.block_width, static_cast<u32>(params.target)); - - SwizzleFunc(morton_to_gl_fns, params, gl_buffer); + for (u32 i = 0; i < params.max_mip_level; i++) + SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); } else { const auto texture_src_data{Memory::GetPointer(params.addr)}; const auto texture_src_data_end{texture_src_data + params.size_in_bytes_gl}; - gl_buffer.assign(texture_src_data, texture_src_data_end); + gl_buffer[0].assign(texture_src_data, texture_src_data_end); + } + for (u32 i = 0; i < params.max_mip_level; i++) { + ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer[i], params.pixel_format, params.MipWidth(i), + params.MipHeight(i), params.MipDepth(i)); } - - ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer, params.pixel_format, params.width, params.height); } MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); @@ -1015,18 +716,19 @@ void CachedSurface::FlushGLBuffer() { ASSERT_MSG(!IsPixelFormatASTC(params.pixel_format), "Unimplemented"); // OpenGL temporary buffer needs to be big enough to store raw texture size - gl_buffer.resize(GetSizeInBytes()); + gl_buffer.resize(1); + gl_buffer[0].resize(GetSizeInBytes()); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); // Ensure no bad interactions with GL_UNPACK_ALIGNMENT - ASSERT(params.width * SurfaceParams::GetBytesPerPixel(params.pixel_format) % 4 == 0); + ASSERT(params.width * GetBytesPerPixel(params.pixel_format) % 4 == 0); glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width)); ASSERT(!tuple.compressed); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); glGetTextureImage(texture.handle, 0, tuple.format, tuple.type, - static_cast<GLsizei>(gl_buffer.size()), gl_buffer.data()); + static_cast<GLsizei>(gl_buffer[0].size()), gl_buffer[0].data()); glPixelStorei(GL_PACK_ROW_LENGTH, 0); - ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width, + ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer[0], params.pixel_format, params.width, params.height); ASSERT(params.type != 
SurfaceType::Fill); const u8* const texture_src_data = Memory::GetPointer(params.addr); @@ -1035,28 +737,23 @@ void CachedSurface::FlushGLBuffer() { ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", params.block_width, static_cast<u32>(params.target)); - SwizzleFunc(gl_to_morton_fns, params, gl_buffer); + SwizzleFunc(MortonSwizzleMode::LinearToMorton, params, gl_buffer[0], 0); } else { - std::memcpy(Memory::GetPointer(GetAddr()), gl_buffer.data(), GetSizeInBytes()); + std::memcpy(Memory::GetPointer(GetAddr()), gl_buffer[0].data(), GetSizeInBytes()); } } -MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192)); -void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) { - if (params.type == SurfaceType::Fill) - return; - - MICROPROFILE_SCOPE(OpenGL_TextureUL); - - const auto& rect{params.GetRect()}; +void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, + GLuint draw_fb_handle) { + const auto& rect{params.GetRect(mip_map)}; // Load data from memory to the surface const GLint x0 = static_cast<GLint>(rect.left); const GLint y0 = static_cast<GLint>(rect.bottom); std::size_t buffer_offset = - static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.width + + static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.MipWidth(mip_map) + static_cast<std::size_t>(x0)) * - SurfaceParams::GetBytesPerPixel(params.pixel_format); + GetBytesPerPixel(params.pixel_format); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); const GLuint target_tex = texture.handle; @@ -1072,89 +769,145 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle cur_state.Apply(); // Ensure no bad interactions with GL_UNPACK_ALIGNMENT - ASSERT(params.width * SurfaceParams::GetBytesPerPixel(params.pixel_format) % 4 == 0); - glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.width)); + ASSERT(params.MipWidth(mip_map) * GetBytesPerPixel(params.pixel_format) % 4 == 0); + glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.MipWidth(mip_map))); + GLsizei image_size = static_cast<GLsizei>(params.GetMipmapSizeGL(mip_map, false)); glActiveTexture(GL_TEXTURE0); if (tuple.compressed) { switch (params.target) { - case SurfaceParams::SurfaceTarget::Texture2D: - glCompressedTexImage2D( - SurfaceTargetToGL(params.target), 0, tuple.internal_format, - static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); + case SurfaceTarget::Texture2D: + glCompressedTexImage2D(SurfaceTargetToGL(params.target), mip_map, tuple.internal_format, + static_cast<GLsizei>(params.MipWidth(mip_map)), + static_cast<GLsizei>(params.MipHeight(mip_map)), 0, image_size, + &gl_buffer[mip_map][buffer_offset]); + break; + case SurfaceTarget::Texture3D: + glCompressedTexImage3D(SurfaceTargetToGL(params.target), mip_map, tuple.internal_format, + static_cast<GLsizei>(params.MipWidth(mip_map)), + static_cast<GLsizei>(params.MipHeight(mip_map)), + static_cast<GLsizei>(params.MipDepth(mip_map)), 0, image_size, + &gl_buffer[mip_map][buffer_offset]); break; - case SurfaceParams::SurfaceTarget::Texture3D: - case SurfaceParams::SurfaceTarget::Texture2DArray: - glCompressedTexImage3D( - SurfaceTargetToGL(params.target), 0, tuple.internal_format, - static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), - static_cast<GLsizei>(params.depth), 0, - 
static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glCompressedTexImage3D(SurfaceTargetToGL(params.target), mip_map, tuple.internal_format, + static_cast<GLsizei>(params.MipWidth(mip_map)), + static_cast<GLsizei>(params.MipHeight(mip_map)), + static_cast<GLsizei>(params.depth), 0, image_size, + &gl_buffer[mip_map][buffer_offset]); break; - case SurfaceParams::SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubemap: { + GLsizei layer_size = static_cast<GLsizei>(params.LayerSizeGL(mip_map)); for (std::size_t face = 0; face < params.depth; ++face) { glCompressedTexImage2D(static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face), - 0, tuple.internal_format, static_cast<GLsizei>(params.width), - static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.SizeInBytesCubeFaceGL()), - &gl_buffer[buffer_offset]); - buffer_offset += params.SizeInBytesCubeFace(); + mip_map, tuple.internal_format, + static_cast<GLsizei>(params.MipWidth(mip_map)), + static_cast<GLsizei>(params.MipHeight(mip_map)), 0, + layer_size, &gl_buffer[mip_map][buffer_offset]); + buffer_offset += layer_size; } break; + } default: LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", static_cast<u32>(params.target)); UNREACHABLE(); - glCompressedTexImage2D( - GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width), - static_cast<GLsizei>(params.height), 0, - static_cast<GLsizei>(params.size_in_bytes_gl), &gl_buffer[buffer_offset]); + glCompressedTexImage2D(GL_TEXTURE_2D, mip_map, tuple.internal_format, + static_cast<GLsizei>(params.MipWidth(mip_map)), + static_cast<GLsizei>(params.MipHeight(mip_map)), 0, + static_cast<GLsizei>(params.size_in_bytes_gl), + &gl_buffer[mip_map][buffer_offset]); } } else { switch (params.target) { - case SurfaceParams::SurfaceTarget::Texture1D: - glTexSubImage1D(SurfaceTargetToGL(params.target), 0, x0, + case SurfaceTarget::Texture1D: + glTexSubImage1D(SurfaceTargetToGL(params.target), mip_map, x0, static_cast<GLsizei>(rect.GetWidth()), tuple.format, tuple.type, - &gl_buffer[buffer_offset]); + &gl_buffer[mip_map][buffer_offset]); break; - case SurfaceParams::SurfaceTarget::Texture2D: - glTexSubImage2D(SurfaceTargetToGL(params.target), 0, x0, y0, + case SurfaceTarget::Texture2D: + glTexSubImage2D(SurfaceTargetToGL(params.target), mip_map, x0, y0, static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[buffer_offset]); + &gl_buffer[mip_map][buffer_offset]); + break; + case SurfaceTarget::Texture3D: + glTexSubImage3D(SurfaceTargetToGL(params.target), mip_map, x0, y0, 0, + static_cast<GLsizei>(rect.GetWidth()), + static_cast<GLsizei>(rect.GetHeight()), params.MipDepth(mip_map), + tuple.format, tuple.type, &gl_buffer[mip_map][buffer_offset]); break; - case SurfaceParams::SurfaceTarget::Texture3D: - case SurfaceParams::SurfaceTarget::Texture2DArray: - glTexSubImage3D(SurfaceTargetToGL(params.target), 0, x0, y0, 0, + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + glTexSubImage3D(SurfaceTargetToGL(params.target), mip_map, x0, y0, 0, static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), params.depth, tuple.format, - tuple.type, &gl_buffer[buffer_offset]); + tuple.type, &gl_buffer[mip_map][buffer_offset]); break; - case SurfaceParams::SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubemap: { + std::size_t start = buffer_offset; 
for (std::size_t face = 0; face < params.depth; ++face) { - glTexSubImage2D(static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face), 0, x0, - y0, static_cast<GLsizei>(rect.GetWidth()), + glTexSubImage2D(static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face), mip_map, + x0, y0, static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[buffer_offset]); - buffer_offset += params.SizeInBytesCubeFace(); + &gl_buffer[mip_map][buffer_offset]); + buffer_offset += params.LayerSizeGL(mip_map); } break; + } default: LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", static_cast<u32>(params.target)); UNREACHABLE(); - glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()), + glTexSubImage2D(GL_TEXTURE_2D, mip_map, x0, y0, static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[buffer_offset]); + &gl_buffer[mip_map][buffer_offset]); } } glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); } -RasterizerCacheOpenGL::RasterizerCacheOpenGL() { +void CachedSurface::EnsureTextureView() { + if (texture_view.handle != 0) + return; + // Compressed texture are not being created with immutable storage + UNIMPLEMENTED_IF(gl_is_compressed); + + const GLenum target{TargetLayer()}; + + texture_view.Create(); + glTextureView(texture_view.handle, target, texture.handle, gl_internal_format, 0, + params.max_mip_level, 0, 1); + + OpenGLState cur_state = OpenGLState::GetCurState(); + const auto& old_tex = cur_state.texture_units[0]; + SCOPE_EXIT({ + cur_state.texture_units[0] = old_tex; + cur_state.Apply(); + }); + cur_state.texture_units[0].texture = texture_view.handle; + cur_state.texture_units[0].target = target; + cur_state.Apply(); + + ApplyTextureDefaults(target, params.max_mip_level); +} + +MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 192, 64)); +void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) { + if (params.type == SurfaceType::Fill) + return; + + MICROPROFILE_SCOPE(OpenGL_TextureUL); + + for (u32 i = 0; i < params.max_mip_level; i++) + UploadGLMipmapTexture(i, read_fb_handle, draw_fb_handle); +} + +RasterizerCacheOpenGL::RasterizerCacheOpenGL(RasterizerOpenGL& rasterizer) + : RasterizerCache{rasterizer} { read_framebuffer.Create(); draw_framebuffer.Create(); copy_pbo.Create(); @@ -1249,6 +1002,31 @@ Surface RasterizerCacheOpenGL::GetUncachedSurface(const SurfaceParams& params) { return surface; } +void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface, + const Surface& dst_surface) { + const auto& init_params{src_surface->GetSurfaceParams()}; + const auto& dst_params{dst_surface->GetSurfaceParams()}; + VAddr address = init_params.addr; + const std::size_t layer_size = dst_params.LayerMemorySize(); + for (u32 layer = 0; layer < dst_params.depth; layer++) { + for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) { + const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap); + const Surface& copy = TryGet(sub_address); + if (!copy) + continue; + const auto& src_params{copy->GetSurfaceParams()}; + const u32 width{std::min(src_params.width, dst_params.MipWidth(mipmap))}; + const u32 height{std::min(src_params.height, dst_params.MipHeight(mipmap))}; + + glCopyImageSubData(copy->Texture().handle, SurfaceTargetToGL(src_params.target), 0, 0, + 0, 0, dst_surface->Texture().handle, + SurfaceTargetToGL(dst_params.target), mipmap, 0, 0, layer, width, + 
height, 1); + } + address += layer_size; + } +} + void RasterizerCacheOpenGL::FermiCopySurface( const Tegra::Engines::Fermi2D::Regs::Surface& src_config, const Tegra::Engines::Fermi2D::Regs::Surface& dst_config) { @@ -1273,7 +1051,10 @@ void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface) { const auto& src_params{src_surface->GetSurfaceParams()}; const auto& dst_params{dst_surface->GetSurfaceParams()}; - FlushRegion(src_params.addr, dst_params.MemorySize()); + + // Flush enough memory for both the source and destination surface + FlushRegion(src_params.addr, std::max(src_params.MemorySize(), dst_params.MemorySize())); + LoadSurface(dst_surface); } @@ -1294,31 +1075,23 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, // For compatible surfaces, we can just do fast glCopyImageSubData based copy if (old_params.target == new_params.target && old_params.type == new_params.type && old_params.depth == new_params.depth && old_params.depth == 1 && - SurfaceParams::GetFormatBpp(old_params.pixel_format) == - SurfaceParams::GetFormatBpp(new_params.pixel_format)) { + GetFormatBpp(old_params.pixel_format) == GetFormatBpp(new_params.pixel_format)) { FastCopySurface(old_surface, new_surface); return new_surface; } - // If the format is the same, just do a framebuffer blit. This is significantly faster than - // using PBOs. The is also likely less accurate, as textures will be converted rather than - // reinterpreted. When use_accurate_gpu_emulation setting is enabled, perform a more accurate - // surface copy, where pixels are reinterpreted as a new format (without conversion). This - // code path uses OpenGL PBOs and is quite slow. - const bool is_blit{old_params.pixel_format == new_params.pixel_format}; - switch (new_params.target) { - case SurfaceParams::SurfaceTarget::Texture2D: - if (is_blit) { - BlitSurface(old_surface, new_surface, read_framebuffer.handle, draw_framebuffer.handle); - } else { - CopySurface(old_surface, new_surface, copy_pbo.handle); - } + case SurfaceTarget::Texture2D: + CopySurface(old_surface, new_surface, copy_pbo.handle); break; - case SurfaceParams::SurfaceTarget::TextureCubemap: - case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceTarget::Texture3D: AccurateCopySurface(old_surface, new_surface); break; + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubeArray: + FastLayeredCopySurface(old_surface, new_surface); + break; default: LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", static_cast<u32>(new_params.target)); @@ -1326,7 +1099,7 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface, } return new_surface; -} // namespace OpenGL +} Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr addr) const { return TryGet(addr); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 6a49880c2..7223700c4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -7,6 +7,7 @@ #include <array> #include <map> #include <memory> +#include <string> #include <vector> #include "common/alignment.h" @@ -18,6 +19,7 @@ #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_gen.h" +#include "video_core/surface.h" #include "video_core/textures/decoders.h" #include 
"video_core/textures/texture.h" @@ -27,135 +29,12 @@ class CachedSurface; using Surface = std::shared_ptr<CachedSurface>; using SurfaceSurfaceRect_Tuple = std::tuple<Surface, Surface, MathUtil::Rectangle<u32>>; -struct SurfaceParams { - enum class PixelFormat { - ABGR8U = 0, - ABGR8S = 1, - ABGR8UI = 2, - B5G6R5U = 3, - A2B10G10R10U = 4, - A1B5G5R5U = 5, - R8U = 6, - R8UI = 7, - RGBA16F = 8, - RGBA16U = 9, - RGBA16UI = 10, - R11FG11FB10F = 11, - RGBA32UI = 12, - DXT1 = 13, - DXT23 = 14, - DXT45 = 15, - DXN1 = 16, // This is also known as BC4 - DXN2UNORM = 17, - DXN2SNORM = 18, - BC7U = 19, - BC6H_UF16 = 20, - BC6H_SF16 = 21, - ASTC_2D_4X4 = 22, - G8R8U = 23, - G8R8S = 24, - BGRA8 = 25, - RGBA32F = 26, - RG32F = 27, - R32F = 28, - R16F = 29, - R16U = 30, - R16S = 31, - R16UI = 32, - R16I = 33, - RG16 = 34, - RG16F = 35, - RG16UI = 36, - RG16I = 37, - RG16S = 38, - RGB32F = 39, - RGBA8_SRGB = 40, - RG8U = 41, - RG8S = 42, - RG32UI = 43, - R32UI = 44, - ASTC_2D_8X8 = 45, - ASTC_2D_8X5 = 46, - ASTC_2D_5X4 = 47, - BGRA8_SRGB = 48, - DXT1_SRGB = 49, - DXT23_SRGB = 50, - DXT45_SRGB = 51, - BC7U_SRGB = 52, - ASTC_2D_4X4_SRGB = 53, - ASTC_2D_8X8_SRGB = 54, - ASTC_2D_8X5_SRGB = 55, - ASTC_2D_5X4_SRGB = 56, - - MaxColorFormat, - - // Depth formats - Z32F = 57, - Z16 = 58, - - MaxDepthFormat, - - // DepthStencil formats - Z24S8 = 59, - S8Z24 = 60, - Z32FS8 = 61, - - MaxDepthStencilFormat, - - Max = MaxDepthStencilFormat, - Invalid = 255, - }; - - static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); - - enum class ComponentType { - Invalid = 0, - SNorm = 1, - UNorm = 2, - SInt = 3, - UInt = 4, - Float = 5, - }; - - enum class SurfaceType { - ColorTexture = 0, - Depth = 1, - DepthStencil = 2, - Fill = 3, - Invalid = 4, - }; - - enum class SurfaceTarget { - Texture1D, - Texture2D, - Texture3D, - Texture1DArray, - Texture2DArray, - TextureCubemap, - }; - - static SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type) { - switch (texture_type) { - case Tegra::Texture::TextureType::Texture1D: - return SurfaceTarget::Texture1D; - case Tegra::Texture::TextureType::Texture2D: - case Tegra::Texture::TextureType::Texture2DNoMipmap: - return SurfaceTarget::Texture2D; - case Tegra::Texture::TextureType::Texture3D: - return SurfaceTarget::Texture3D; - case Tegra::Texture::TextureType::TextureCubemap: - return SurfaceTarget::TextureCubemap; - case Tegra::Texture::TextureType::Texture1DArray: - return SurfaceTarget::Texture1DArray; - case Tegra::Texture::TextureType::Texture2DArray: - return SurfaceTarget::Texture2DArray; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type)); - UNREACHABLE(); - return SurfaceTarget::Texture2D; - } - } +using SurfaceTarget = VideoCore::Surface::SurfaceTarget; +using SurfaceType = VideoCore::Surface::SurfaceType; +using PixelFormat = VideoCore::Surface::PixelFormat; +using ComponentType = VideoCore::Surface::ComponentType; +struct SurfaceParams { static std::string SurfaceTargetName(SurfaceTarget target) { switch (target) { case SurfaceTarget::Texture1D: @@ -170,6 +49,8 @@ struct SurfaceParams { return "Texture2DArray"; case SurfaceTarget::TextureCubemap: return "TextureCubemap"; + case SurfaceTarget::TextureCubeArray: + return "TextureCubeArray"; default: LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); UNREACHABLE(); @@ -177,664 +58,12 @@ struct SurfaceParams { } } - static bool SurfaceTargetIsLayered(SurfaceTarget target) { - switch 
(target) { - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture2D: - case SurfaceTarget::Texture3D: - return false; - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - return true; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); - UNREACHABLE(); - return false; - } - } - - /** - * Gets the compression factor for the specified PixelFormat. This applies to just the - * "compressed width" and "compressed height", not the overall compression factor of a - * compressed image. This is used for maintaining proper surface sizes for compressed - * texture formats. - */ - static constexpr u32 GetCompressionFactor(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // G8R8U - 1, // G8R8S - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // R32UI - 4, // ASTC_2D_8X8 - 4, // ASTC_2D_8X5 - 4, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 4, // ASTC_2D_4X4_SRGB - 4, // ASTC_2D_8X8_SRGB - 4, // ASTC_2D_8X5_SRGB - 4, // ASTC_2D_5X4_SRGB - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 - }}; - - ASSERT(static_cast<std::size_t>(format) < compression_factor_table.size()); - return compression_factor_table[static_cast<std::size_t>(format)]; - } - - static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // G8R8U - 1, // G8R8S - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // R32UI - 8, // ASTC_2D_8X8 - 5, // ASTC_2D_8X5 - 4, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 4, // ASTC_2D_4X4_SRGB - 8, // ASTC_2D_8X8_SRGB - 5, // ASTC_2D_8X5_SRGB - 4, // ASTC_2D_5X4_SRGB - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 - }}; - ASSERT(static_cast<std::size_t>(format) < block_height_table.size()); - return block_height_table[static_cast<std::size_t>(format)]; - } - - static constexpr u32 GetFormatBpp(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ - 32, 
// ABGR8U - 32, // ABGR8S - 32, // ABGR8UI - 16, // B5G6R5U - 32, // A2B10G10R10U - 16, // A1B5G5R5U - 8, // R8U - 8, // R8UI - 64, // RGBA16F - 64, // RGBA16U - 64, // RGBA16UI - 32, // R11FG11FB10F - 128, // RGBA32UI - 64, // DXT1 - 128, // DXT23 - 128, // DXT45 - 64, // DXN1 - 128, // DXN2UNORM - 128, // DXN2SNORM - 128, // BC7U - 128, // BC6H_UF16 - 128, // BC6H_SF16 - 32, // ASTC_2D_4X4 - 16, // G8R8U - 16, // G8R8S - 32, // BGRA8 - 128, // RGBA32F - 64, // RG32F - 32, // R32F - 16, // R16F - 16, // R16U - 16, // R16S - 16, // R16UI - 16, // R16I - 32, // RG16 - 32, // RG16F - 32, // RG16UI - 32, // RG16I - 32, // RG16S - 96, // RGB32F - 32, // RGBA8_SRGB - 16, // RG8U - 16, // RG8S - 64, // RG32UI - 32, // R32UI - 16, // ASTC_2D_8X8 - 16, // ASTC_2D_8X5 - 32, // ASTC_2D_5X4 - 32, // BGRA8_SRGB - 64, // DXT1_SRGB - 128, // DXT23_SRGB - 128, // DXT45_SRGB - 128, // BC7U - 32, // ASTC_2D_4X4_SRGB - 16, // ASTC_2D_8X8_SRGB - 16, // ASTC_2D_8X5_SRGB - 32, // ASTC_2D_5X4_SRGB - 32, // Z32F - 16, // Z16 - 32, // Z24S8 - 32, // S8Z24 - 64, // Z32FS8 - }}; - - ASSERT(static_cast<std::size_t>(format) < bpp_table.size()); - return bpp_table[static_cast<std::size_t>(format)]; - } - u32 GetFormatBpp() const { - return GetFormatBpp(pixel_format); - } - - static PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { - switch (format) { - case Tegra::DepthFormat::S8_Z24_UNORM: - return PixelFormat::S8Z24; - case Tegra::DepthFormat::Z24_S8_UNORM: - return PixelFormat::Z24S8; - case Tegra::DepthFormat::Z32_FLOAT: - return PixelFormat::Z32F; - case Tegra::DepthFormat::Z16_UNORM: - return PixelFormat::Z16; - case Tegra::DepthFormat::Z32_S8_X24_FLOAT: - return PixelFormat::Z32FS8; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - } - } - - static PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) { - switch (format) { - // TODO (Hexagon12): Converting SRGBA to RGBA is a hack and doesn't completely correct the - // gamma. 
- case Tegra::RenderTargetFormat::RGBA8_SRGB: - return PixelFormat::RGBA8_SRGB; - case Tegra::RenderTargetFormat::RGBA8_UNORM: - return PixelFormat::ABGR8U; - case Tegra::RenderTargetFormat::RGBA8_SNORM: - return PixelFormat::ABGR8S; - case Tegra::RenderTargetFormat::RGBA8_UINT: - return PixelFormat::ABGR8UI; - case Tegra::RenderTargetFormat::BGRA8_SRGB: - return PixelFormat::BGRA8_SRGB; - case Tegra::RenderTargetFormat::BGRA8_UNORM: - return PixelFormat::BGRA8; - case Tegra::RenderTargetFormat::RGB10_A2_UNORM: - return PixelFormat::A2B10G10R10U; - case Tegra::RenderTargetFormat::RGBA16_FLOAT: - return PixelFormat::RGBA16F; - case Tegra::RenderTargetFormat::RGBA16_UNORM: - return PixelFormat::RGBA16U; - case Tegra::RenderTargetFormat::RGBA16_UINT: - return PixelFormat::RGBA16UI; - case Tegra::RenderTargetFormat::RGBA32_FLOAT: - return PixelFormat::RGBA32F; - case Tegra::RenderTargetFormat::RG32_FLOAT: - return PixelFormat::RG32F; - case Tegra::RenderTargetFormat::R11G11B10_FLOAT: - return PixelFormat::R11FG11FB10F; - case Tegra::RenderTargetFormat::B5G6R5_UNORM: - return PixelFormat::B5G6R5U; - case Tegra::RenderTargetFormat::BGR5A1_UNORM: - return PixelFormat::A1B5G5R5U; - case Tegra::RenderTargetFormat::RGBA32_UINT: - return PixelFormat::RGBA32UI; - case Tegra::RenderTargetFormat::R8_UNORM: - return PixelFormat::R8U; - case Tegra::RenderTargetFormat::R8_UINT: - return PixelFormat::R8UI; - case Tegra::RenderTargetFormat::RG16_FLOAT: - return PixelFormat::RG16F; - case Tegra::RenderTargetFormat::RG16_UINT: - return PixelFormat::RG16UI; - case Tegra::RenderTargetFormat::RG16_SINT: - return PixelFormat::RG16I; - case Tegra::RenderTargetFormat::RG16_UNORM: - return PixelFormat::RG16; - case Tegra::RenderTargetFormat::RG16_SNORM: - return PixelFormat::RG16S; - case Tegra::RenderTargetFormat::RG8_UNORM: - return PixelFormat::RG8U; - case Tegra::RenderTargetFormat::RG8_SNORM: - return PixelFormat::RG8S; - case Tegra::RenderTargetFormat::R16_FLOAT: - return PixelFormat::R16F; - case Tegra::RenderTargetFormat::R16_UNORM: - return PixelFormat::R16U; - case Tegra::RenderTargetFormat::R16_SNORM: - return PixelFormat::R16S; - case Tegra::RenderTargetFormat::R16_UINT: - return PixelFormat::R16UI; - case Tegra::RenderTargetFormat::R16_SINT: - return PixelFormat::R16I; - case Tegra::RenderTargetFormat::R32_FLOAT: - return PixelFormat::R32F; - case Tegra::RenderTargetFormat::R32_UINT: - return PixelFormat::R32UI; - case Tegra::RenderTargetFormat::RG32_UINT: - return PixelFormat::RG32UI; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - } - } - - static PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, - Tegra::Texture::ComponentType component_type, - bool is_srgb) { - // TODO(Subv): Properly implement this - switch (format) { - case Tegra::Texture::TextureFormat::A8R8G8B8: - if (is_srgb) { - return PixelFormat::RGBA8_SRGB; - } - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::ABGR8U; - case Tegra::Texture::ComponentType::SNORM: - return PixelFormat::ABGR8S; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::ABGR8UI; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::B5G6R5: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::B5G6R5U; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - 
static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::A2B10G10R10: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::A2B10G10R10U; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::A1B5G5R5: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::A1B5G5R5U; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R8: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::R8U; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::R8UI; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::G8R8: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::G8R8U; - case Tegra::Texture::ComponentType::SNORM: - return PixelFormat::G8R8S; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R16_G16_B16_A16: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::RGBA16U; - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::RGBA16F; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::BF10GF11RF11: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::R11FG11FB10F; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R32_G32_B32_A32: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::RGBA32F; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::RGBA32UI; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R32_G32: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::RG32F; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::RG32UI; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R32_G32_B32: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::RGB32F; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R16: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::R16F; - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::R16U; - case Tegra::Texture::ComponentType::SNORM: - return PixelFormat::R16S; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::R16UI; - case Tegra::Texture::ComponentType::SINT: - return PixelFormat::R16I; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::R32: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::R32F; - case 
Tegra::Texture::ComponentType::UINT: - return PixelFormat::R32UI; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::ZF32: - return PixelFormat::Z32F; - case Tegra::Texture::TextureFormat::Z16: - return PixelFormat::Z16; - case Tegra::Texture::TextureFormat::Z24S8: - return PixelFormat::Z24S8; - case Tegra::Texture::TextureFormat::DXT1: - return is_srgb ? PixelFormat::DXT1_SRGB : PixelFormat::DXT1; - case Tegra::Texture::TextureFormat::DXT23: - return is_srgb ? PixelFormat::DXT23_SRGB : PixelFormat::DXT23; - case Tegra::Texture::TextureFormat::DXT45: - return is_srgb ? PixelFormat::DXT45_SRGB : PixelFormat::DXT45; - case Tegra::Texture::TextureFormat::DXN1: - return PixelFormat::DXN1; - case Tegra::Texture::TextureFormat::DXN2: - switch (component_type) { - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::DXN2UNORM; - case Tegra::Texture::ComponentType::SNORM: - return PixelFormat::DXN2SNORM; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - case Tegra::Texture::TextureFormat::BC7U: - return is_srgb ? PixelFormat::BC7U_SRGB : PixelFormat::BC7U; - case Tegra::Texture::TextureFormat::BC6H_UF16: - return PixelFormat::BC6H_UF16; - case Tegra::Texture::TextureFormat::BC6H_SF16: - return PixelFormat::BC6H_SF16; - case Tegra::Texture::TextureFormat::ASTC_2D_4X4: - return is_srgb ? PixelFormat::ASTC_2D_4X4_SRGB : PixelFormat::ASTC_2D_4X4; - case Tegra::Texture::TextureFormat::ASTC_2D_5X4: - return is_srgb ? PixelFormat::ASTC_2D_5X4_SRGB : PixelFormat::ASTC_2D_5X4; - case Tegra::Texture::TextureFormat::ASTC_2D_8X8: - return is_srgb ? PixelFormat::ASTC_2D_8X8_SRGB : PixelFormat::ASTC_2D_8X8; - case Tegra::Texture::TextureFormat::ASTC_2D_8X5: - return is_srgb ? 
PixelFormat::ASTC_2D_8X5_SRGB : PixelFormat::ASTC_2D_8X5; - case Tegra::Texture::TextureFormat::R16_G16: - switch (component_type) { - case Tegra::Texture::ComponentType::FLOAT: - return PixelFormat::RG16F; - case Tegra::Texture::ComponentType::UNORM: - return PixelFormat::RG16; - case Tegra::Texture::ComponentType::SNORM: - return PixelFormat::RG16S; - case Tegra::Texture::ComponentType::UINT: - return PixelFormat::RG16UI; - case Tegra::Texture::ComponentType::SINT: - return PixelFormat::RG16I; - } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", - static_cast<u32>(component_type)); - UNREACHABLE(); - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", - static_cast<u32>(format), static_cast<u32>(component_type)); - UNREACHABLE(); - } - } - - static ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) { - // TODO(Subv): Implement more component types - switch (type) { - case Tegra::Texture::ComponentType::UNORM: - return ComponentType::UNorm; - case Tegra::Texture::ComponentType::FLOAT: - return ComponentType::Float; - case Tegra::Texture::ComponentType::SNORM: - return ComponentType::SNorm; - case Tegra::Texture::ComponentType::UINT: - return ComponentType::UInt; - case Tegra::Texture::ComponentType::SINT: - return ComponentType::SInt; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented component type={}", static_cast<u32>(type)); - UNREACHABLE(); - } - } - - static ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format) { - // TODO(Subv): Implement more render targets - switch (format) { - case Tegra::RenderTargetFormat::RGBA8_UNORM: - case Tegra::RenderTargetFormat::RGBA8_SRGB: - case Tegra::RenderTargetFormat::BGRA8_UNORM: - case Tegra::RenderTargetFormat::BGRA8_SRGB: - case Tegra::RenderTargetFormat::RGB10_A2_UNORM: - case Tegra::RenderTargetFormat::R8_UNORM: - case Tegra::RenderTargetFormat::RG16_UNORM: - case Tegra::RenderTargetFormat::R16_UNORM: - case Tegra::RenderTargetFormat::B5G6R5_UNORM: - case Tegra::RenderTargetFormat::BGR5A1_UNORM: - case Tegra::RenderTargetFormat::RG8_UNORM: - case Tegra::RenderTargetFormat::RGBA16_UNORM: - return ComponentType::UNorm; - case Tegra::RenderTargetFormat::RGBA8_SNORM: - case Tegra::RenderTargetFormat::RG16_SNORM: - case Tegra::RenderTargetFormat::R16_SNORM: - case Tegra::RenderTargetFormat::RG8_SNORM: - return ComponentType::SNorm; - case Tegra::RenderTargetFormat::RGBA16_FLOAT: - case Tegra::RenderTargetFormat::R11G11B10_FLOAT: - case Tegra::RenderTargetFormat::RGBA32_FLOAT: - case Tegra::RenderTargetFormat::RG32_FLOAT: - case Tegra::RenderTargetFormat::RG16_FLOAT: - case Tegra::RenderTargetFormat::R16_FLOAT: - case Tegra::RenderTargetFormat::R32_FLOAT: - return ComponentType::Float; - case Tegra::RenderTargetFormat::RGBA32_UINT: - case Tegra::RenderTargetFormat::RGBA16_UINT: - case Tegra::RenderTargetFormat::RG16_UINT: - case Tegra::RenderTargetFormat::R8_UINT: - case Tegra::RenderTargetFormat::R16_UINT: - case Tegra::RenderTargetFormat::RG32_UINT: - case Tegra::RenderTargetFormat::R32_UINT: - case Tegra::RenderTargetFormat::RGBA8_UINT: - return ComponentType::UInt; - case Tegra::RenderTargetFormat::RG16_SINT: - case Tegra::RenderTargetFormat::R16_SINT: - return ComponentType::SInt; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - } - } - - static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) { - switch (format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: 
- return PixelFormat::ABGR8U; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - } - } - - static ComponentType ComponentTypeFromDepthFormat(Tegra::DepthFormat format) { - switch (format) { - case Tegra::DepthFormat::Z16_UNORM: - case Tegra::DepthFormat::S8_Z24_UNORM: - case Tegra::DepthFormat::Z24_S8_UNORM: - return ComponentType::UNorm; - case Tegra::DepthFormat::Z32_FLOAT: - case Tegra::DepthFormat::Z32_S8_X24_FLOAT: - return ComponentType::Float; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - } - } - - static SurfaceType GetFormatType(PixelFormat pixel_format) { - if (static_cast<std::size_t>(pixel_format) < - static_cast<std::size_t>(PixelFormat::MaxColorFormat)) { - return SurfaceType::ColorTexture; - } - - if (static_cast<std::size_t>(pixel_format) < - static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) { - return SurfaceType::Depth; - } - - if (static_cast<std::size_t>(pixel_format) < - static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) { - return SurfaceType::DepthStencil; - } - - // TODO(Subv): Implement the other formats - ASSERT(false); - - return SurfaceType::Invalid; - } - - /// Returns the sizer in bytes of the specified pixel format - static constexpr u32 GetBytesPerPixel(PixelFormat pixel_format) { - if (pixel_format == SurfaceParams::PixelFormat::Invalid) { - return 0; - } - return GetFormatBpp(pixel_format) / CHAR_BIT; + return VideoCore::Surface::GetFormatBpp(pixel_format); } /// Returns the rectangle corresponding to this surface - MathUtil::Rectangle<u32> GetRect() const; + MathUtil::Rectangle<u32> GetRect(u32 mip_level = 0) const; /// Returns the total size of this surface in bytes, adjusted for compression std::size_t SizeInBytesRaw(bool ignore_tiled = false) const { @@ -865,7 +94,7 @@ struct SurfaceParams { /// Returns the exact size of memory occupied by the texture in VRAM, including mipmaps. std::size_t MemorySize() const { - std::size_t size = InnerMemorySize(is_layered); + std::size_t size = InnerMemorySize(false, is_layered); if (is_layered) return size * depth; return size; @@ -874,12 +103,78 @@ struct SurfaceParams { /// Returns the exact size of the memory occupied by a layer in a texture in VRAM, including /// mipmaps. std::size_t LayerMemorySize() const { - return InnerMemorySize(true); + return InnerMemorySize(false, true); } /// Returns the size of a layer of this surface in OpenGL. 
- std::size_t LayerSizeGL() const { - return SizeInBytesRaw(true) / depth; + std::size_t LayerSizeGL(u32 mip_level) const { + return InnerMipmapMemorySize(mip_level, true, is_layered, false); + } + + std::size_t GetMipmapSizeGL(u32 mip_level, bool ignore_compressed = true) const { + std::size_t size = InnerMipmapMemorySize(mip_level, true, is_layered, ignore_compressed); + if (is_layered) + return size * depth; + return size; + } + + std::size_t GetMipmapLevelOffset(u32 mip_level) const { + std::size_t offset = 0; + for (u32 i = 0; i < mip_level; i++) + offset += InnerMipmapMemorySize(i, false, is_layered); + return offset; + } + + std::size_t GetMipmapLevelOffsetGL(u32 mip_level) const { + std::size_t offset = 0; + for (u32 i = 0; i < mip_level; i++) + offset += InnerMipmapMemorySize(i, true, is_layered); + return offset; + } + + u32 MipWidth(u32 mip_level) const { + return std::max(1U, width >> mip_level); + } + + u32 MipHeight(u32 mip_level) const { + return std::max(1U, height >> mip_level); + } + + u32 MipDepth(u32 mip_level) const { + return is_layered ? depth : std::max(1U, depth >> mip_level); + } + + // Auto block resizing algorithm from: + // https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_miptree.c + u32 MipBlockHeight(u32 mip_level) const { + if (mip_level == 0) + return block_height; + u32 alt_height = MipHeight(mip_level); + u32 h = GetDefaultBlockHeight(pixel_format); + u32 blocks_in_y = (alt_height + h - 1) / h; + u32 bh = 16; + while (bh > 1 && blocks_in_y <= bh * 4) { + bh >>= 1; + } + return bh; + } + + u32 MipBlockDepth(u32 mip_level) const { + if (mip_level == 0) + return block_depth; + if (is_layered) + return 1; + u32 depth = MipDepth(mip_level); + u32 bd = 32; + while (bd > 1 && depth * 2 <= bd) { + bd >>= 1; + } + if (bd == 32) { + u32 bh = MipBlockHeight(mip_level); + if (bh >= 4) + return 16; + } + return bd; } /// Creates SurfaceParams from a texture configuration @@ -901,9 +196,15 @@ struct SurfaceParams { /// Checks if surfaces are compatible for caching bool IsCompatibleSurface(const SurfaceParams& other) const { - return std::tie(pixel_format, type, width, height, target, depth) == - std::tie(other.pixel_format, other.type, other.width, other.height, other.target, - other.depth); + if (std::tie(pixel_format, type, width, height, target, depth, is_tiled) == + std::tie(other.pixel_format, other.type, other.width, other.height, other.target, + other.depth, other.is_tiled)) { + if (!is_tiled) + return true; + return std::tie(block_height, block_depth, tile_width_spacing) == + std::tie(other.block_height, other.block_depth, other.tile_width_spacing); + } + return false; } /// Initializes parameters for caching, should be called after everything has been initialized @@ -913,6 +214,7 @@ struct SurfaceParams { u32 block_width; u32 block_height; u32 block_depth; + u32 tile_width_spacing; PixelFormat pixel_format; ComponentType component_type; SurfaceType type; @@ -940,7 +242,10 @@ struct SurfaceParams { } rt; private: - std::size_t InnerMemorySize(bool layer_only = false) const; + std::size_t InnerMipmapMemorySize(u32 mip_level, bool force_gl = false, bool layer_only = false, + bool uncompressed = false) const; + std::size_t InnerMemorySize(bool force_gl = false, bool layer_only = false, + bool uncompressed = false) const; }; }; // namespace OpenGL @@ -966,6 +271,8 @@ struct hash<SurfaceReserveKey> { namespace OpenGL { +class RasterizerOpenGL; + class CachedSurface final : public RasterizerCacheObject { public: CachedSurface(const 
SurfaceParams& params); @@ -986,10 +293,31 @@ public: return texture; } + const OGLTexture& TextureLayer() { + if (params.is_layered) { + return Texture(); + } + EnsureTextureView(); + return texture_view; + } + GLenum Target() const { return gl_target; } + GLenum TargetLayer() const { + using VideoCore::Surface::SurfaceTarget; + switch (params.target) { + case SurfaceTarget::Texture1D: + return GL_TEXTURE_1D_ARRAY; + case SurfaceTarget::Texture2D: + return GL_TEXTURE_2D_ARRAY; + case SurfaceTarget::TextureCubemap: + return GL_TEXTURE_CUBE_MAP_ARRAY; + } + return Target(); + } + const SurfaceParams& GetSurfaceParams() const { return params; } @@ -1002,16 +330,23 @@ public: void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle); private: + void UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, GLuint draw_fb_handle); + + void EnsureTextureView(); + OGLTexture texture; - std::vector<u8> gl_buffer; - SurfaceParams params; - GLenum gl_target; - std::size_t cached_size_in_bytes; + OGLTexture texture_view; + std::vector<std::vector<u8>> gl_buffer; + SurfaceParams params{}; + GLenum gl_target{}; + GLenum gl_internal_format{}; + bool gl_is_compressed{}; + std::size_t cached_size_in_bytes{}; }; class RasterizerCacheOpenGL final : public RasterizerCache<Surface> { public: - RasterizerCacheOpenGL(); + explicit RasterizerCacheOpenGL(RasterizerOpenGL& rasterizer); /// Get a surface based on the texture configuration Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config, @@ -1048,6 +383,7 @@ private: /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface); + void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface); /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have /// previously been used. This is to prevent surfaces from being constantly created and diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp new file mode 100644 index 000000000..c17d5ac00 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -0,0 +1,186 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <utility> +#include <glad/glad.h> +#include "common/common_types.h" +#include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_state.h" + +MICROPROFILE_DEFINE(OpenGL_ResourceCreation, "OpenGL", "Resource Creation", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_RGB(128, 128, 192)); + +namespace OpenGL { + +void OGLTexture::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenTextures(1, &handle); +} + +void OGLTexture::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteTextures(1, &handle); + OpenGLState::GetCurState().UnbindTexture(handle).Apply(); + handle = 0; +} + +void OGLSampler::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenSamplers(1, &handle); +} + +void OGLSampler::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteSamplers(1, &handle); + OpenGLState::GetCurState().ResetSampler(handle).Apply(); + handle = 0; +} + +void OGLShader::Create(const char* source, GLenum type) { + if (handle != 0) + return; + if (source == nullptr) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + handle = GLShader::LoadShader(source, type); +} + +void OGLShader::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteShader(handle); + handle = 0; +} + +void OGLProgram::CreateFromSource(const char* vert_shader, const char* geo_shader, + const char* frag_shader, bool separable_program) { + OGLShader vert, geo, frag; + if (vert_shader) + vert.Create(vert_shader, GL_VERTEX_SHADER); + if (geo_shader) + geo.Create(geo_shader, GL_GEOMETRY_SHADER); + if (frag_shader) + frag.Create(frag_shader, GL_FRAGMENT_SHADER); + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + Create(separable_program, vert.handle, geo.handle, frag.handle); +} + +void OGLProgram::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgram(handle); + OpenGLState::GetCurState().ResetProgram(handle).Apply(); + handle = 0; +} + +void OGLPipeline::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenProgramPipelines(1, &handle); +} + +void OGLPipeline::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgramPipelines(1, &handle); + OpenGLState::GetCurState().ResetPipeline(handle).Apply(); + handle = 0; +} + +void OGLBuffer::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenBuffers(1, &handle); +} + +void OGLBuffer::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteBuffers(1, &handle); + OpenGLState::GetCurState().ResetBuffer(handle).Apply(); + handle = 0; +} + +void OGLSync::Create() { + if (handle != 0) + return; + + // Don't profile here, this one is expected to happen ingame. + handle = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); +} + +void OGLSync::Release() { + if (handle == 0) + return; + + // Don't profile here, this one is expected to happen ingame. 
+ glDeleteSync(handle); + handle = 0; +} + +void OGLVertexArray::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenVertexArrays(1, &handle); +} + +void OGLVertexArray::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteVertexArrays(1, &handle); + OpenGLState::GetCurState().ResetVertexArray(handle).Apply(); + handle = 0; +} + +void OGLFramebuffer::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glGenFramebuffers(1, &handle); +} + +void OGLFramebuffer::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteFramebuffers(1, &handle); + OpenGLState::GetCurState().ResetFramebuffer(handle).Apply(); + handle = 0; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 3bc1b83b5..e33f1e973 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -8,7 +8,6 @@ #include <glad/glad.h> #include "common/common_types.h" #include "video_core/renderer_opengl/gl_shader_util.h" -#include "video_core/renderer_opengl/gl_state.h" namespace OpenGL { @@ -29,20 +28,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenTextures(1, &handle); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteTextures(1, &handle); - OpenGLState::GetCurState().UnbindTexture(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -64,20 +53,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenSamplers(1, &handle); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteSamplers(1, &handle); - OpenGLState::GetCurState().ResetSampler(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -98,20 +77,9 @@ public: return *this; } - void Create(const char* source, GLenum type) { - if (handle != 0) - return; - if (source == nullptr) - return; - handle = GLShader::LoadShader(source, type); - } + void Create(const char* source, GLenum type); - void Release() { - if (handle == 0) - return; - glDeleteShader(handle); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -141,25 +109,10 @@ public: /// Creates a new internal OpenGL resource and stores the handle void CreateFromSource(const char* vert_shader, const char* geo_shader, const char* frag_shader, - bool separable_program = false) { - OGLShader vert, geo, frag; - if (vert_shader) - vert.Create(vert_shader, GL_VERTEX_SHADER); - if (geo_shader) - geo.Create(geo_shader, GL_GEOMETRY_SHADER); - if (frag_shader) - frag.Create(frag_shader, GL_FRAGMENT_SHADER); - Create(separable_program, vert.handle, geo.handle, frag.handle); - } + bool separable_program = false); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteProgram(handle); - OpenGLState::GetCurState().ResetProgram(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -178,20 +131,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenProgramPipelines(1, &handle); - } + void Create(); /// Deletes the 
internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteProgramPipelines(1, &handle); - OpenGLState::GetCurState().ResetPipeline(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -213,20 +156,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenBuffers(1, &handle); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteBuffers(1, &handle); - OpenGLState::GetCurState().ResetBuffer(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -247,19 +180,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - handle = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteSync(handle); - handle = 0; - } + void Release(); GLsync handle = 0; }; @@ -281,20 +205,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenVertexArrays(1, &handle); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteVertexArrays(1, &handle); - OpenGLState::GetCurState().ResetVertexArray(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; @@ -316,20 +230,10 @@ public: } /// Creates a new internal OpenGL resource and stores the handle - void Create() { - if (handle != 0) - return; - glGenFramebuffers(1, &handle); - } + void Create(); /// Deletes the internal OpenGL resource - void Release() { - if (handle == 0) - return; - glDeleteFramebuffers(1, &handle); - OpenGLState::GetCurState().ResetFramebuffer(handle).Apply(); - handle = 0; - } + void Release(); GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 1a03a677f..aea6bf1af 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -2,13 +2,16 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <boost/functional/hash.hpp> #include "common/assert.h" +#include "common/hash.h" #include "core/core.h" #include "core/memory.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -#include "video_core/utils.h" +#include "video_core/renderer_opengl/utils.h" namespace OpenGL { @@ -65,14 +68,17 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type) // stage here. 
setup.SetProgramB(GetShaderCode(GetShaderAddress(Maxwell::ShaderProgram::VertexB))); case Maxwell::ShaderProgram::VertexB: + CalculateProperties(); program_result = GLShader::GenerateVertexShader(setup); gl_type = GL_VERTEX_SHADER; break; case Maxwell::ShaderProgram::Geometry: + CalculateProperties(); program_result = GLShader::GenerateGeometryShader(setup); gl_type = GL_GEOMETRY_SHADER; break; case Maxwell::ShaderProgram::Fragment: + CalculateProperties(); program_result = GLShader::GenerateFragmentShader(setup); gl_type = GL_FRAGMENT_SHADER; break; @@ -83,13 +89,14 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type) } entries = program_result.second; + shader_length = entries.shader_length; if (program_type != Maxwell::ShaderProgram::Geometry) { OGLShader shader; shader.Create(program_result.first.c_str(), gl_type); program.Create(true, shader.handle); SetShaderUniformBlockBindings(program.handle); - VideoCore::LabelGLObject(GL_PROGRAM, program.handle, addr); + LabelGLObject(GL_PROGRAM, program.handle, addr); } else { // Store shader's code to lazily build it on draw geometry_programs.code = program_result.first; @@ -120,20 +127,66 @@ GLint CachedShader::GetUniformLocation(const GLShader::SamplerEntry& sampler) { } GLuint CachedShader::LazyGeometryProgram(OGLProgram& target_program, - const std::string& glsl_topology, + const std::string& glsl_topology, u32 max_vertices, const std::string& debug_name) { if (target_program.handle != 0) { return target_program.handle; } - const std::string source{geometry_programs.code + "layout (" + glsl_topology + ") in;\n"}; + std::string source = "#version 430 core\n"; + source += "layout (" + glsl_topology + ") in;\n"; + source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; + source += geometry_programs.code; + OGLShader shader; shader.Create(source.c_str(), GL_GEOMETRY_SHADER); target_program.Create(true, shader.handle); SetShaderUniformBlockBindings(target_program.handle); - VideoCore::LabelGLObject(GL_PROGRAM, target_program.handle, addr, debug_name); + LabelGLObject(GL_PROGRAM, target_program.handle, addr, debug_name); return target_program.handle; }; +static bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { + // sched instructions appear once every 4 instructions. 
+ static constexpr std::size_t SchedPeriod = 4; + const std::size_t absolute_offset = offset - main_offset; + return (absolute_offset % SchedPeriod) == 0; +} + +static std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { + constexpr std::size_t start_offset = 10; + std::size_t offset = start_offset; + std::size_t size = start_offset * sizeof(u64); + while (offset < program.size()) { + const u64 inst = program[offset]; + if (!IsSchedInstruction(offset, start_offset)) { + if (inst == 0 || (inst >> 52) == 0x50b) { + break; + } + } + size += sizeof(inst); + offset++; + } + return size; +} + +void CachedShader::CalculateProperties() { + setup.program.real_size = CalculateProgramSize(setup.program.code); + setup.program.real_size_b = 0; + setup.program.unique_identifier = Common::CityHash64( + reinterpret_cast<const char*>(setup.program.code.data()), setup.program.real_size); + if (program_type == Maxwell::ShaderProgram::VertexA) { + std::size_t seed = 0; + boost::hash_combine(seed, setup.program.unique_identifier); + setup.program.real_size_b = CalculateProgramSize(setup.program.code_b); + const u64 identifier_b = Common::CityHash64( + reinterpret_cast<const char*>(setup.program.code_b.data()), setup.program.real_size_b); + boost::hash_combine(seed, identifier_b); + setup.program.unique_identifier = static_cast<u64>(seed); + } +} + +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer) : RasterizerCache{rasterizer} {} + Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const VAddr program_addr{GetShaderAddress(program)}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index a210f1731..de3671acf 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -16,6 +16,8 @@ namespace OpenGL { class CachedShader; +class RasterizerOpenGL; + using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; @@ -28,7 +30,7 @@ public: } std::size_t GetSizeInBytes() const override { - return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64); + return shader_length; } // We do not have to flush this cache as things in it are never modified by us. 
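Shader cache entries now report a measured size instead of the MAX_PROGRAM_CODE_LENGTH upper bound: the code is scanned past the 10-word header, scheduling slots are skipped, and the scan stops at the first null word or 0x50b-prefixed instruction; the same byte range also feeds the CityHash64 unique identifier, with VertexA/VertexB pairs folded together via boost::hash_combine. A condensed, standalone restatement of that scan for reference (GuessProgramSize is an assumed name, not part of the diff):

#include <cstddef>
#include <cstdint>
#include <vector>

std::size_t GuessProgramSize(const std::vector<std::uint64_t>& code) {
    constexpr std::size_t start_offset = 10; // words before this are the program header
    std::size_t size = start_offset * sizeof(std::uint64_t);
    for (std::size_t offset = start_offset; offset < code.size(); ++offset) {
        // A scheduling slot occupies every fourth word past the header; it never terminates
        // the program, so only non-sched words are tested against the end markers.
        const bool is_sched = ((offset - start_offset) % 4) == 0;
        const std::uint64_t inst = code[offset];
        if (!is_sched && (inst == 0 || (inst >> 52) == 0x50b)) {
            break;
        }
        size += sizeof(inst);
    }
    return size;
}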
@@ -46,24 +48,26 @@ public: } switch (primitive_mode) { case GL_POINTS: - return LazyGeometryProgram(geometry_programs.points, "points", "ShaderPoints"); + return LazyGeometryProgram(geometry_programs.points, "points", 1, "ShaderPoints"); case GL_LINES: case GL_LINE_STRIP: - return LazyGeometryProgram(geometry_programs.lines, "lines", "ShaderLines"); + return LazyGeometryProgram(geometry_programs.lines, "lines", 2, "ShaderLines"); case GL_LINES_ADJACENCY: case GL_LINE_STRIP_ADJACENCY: - return LazyGeometryProgram(geometry_programs.lines_adjacency, "lines_adjacency", + return LazyGeometryProgram(geometry_programs.lines_adjacency, "lines_adjacency", 4, "ShaderLinesAdjacency"); case GL_TRIANGLES: case GL_TRIANGLE_STRIP: case GL_TRIANGLE_FAN: - return LazyGeometryProgram(geometry_programs.triangles, "triangles", "ShaderTriangles"); + return LazyGeometryProgram(geometry_programs.triangles, "triangles", 3, + "ShaderTriangles"); case GL_TRIANGLES_ADJACENCY: case GL_TRIANGLE_STRIP_ADJACENCY: return LazyGeometryProgram(geometry_programs.triangles_adjacency, "triangles_adjacency", - "ShaderLines"); + 6, "ShaderTrianglesAdjacency"); default: UNREACHABLE_MSG("Unknown primitive mode."); + return LazyGeometryProgram(geometry_programs.points, "points", 1, "ShaderPoints"); } } @@ -76,9 +80,12 @@ public: private: /// Generates a geometry shader or returns one that already exists. GLuint LazyGeometryProgram(OGLProgram& target_program, const std::string& glsl_topology, - const std::string& debug_name); + u32 max_vertices, const std::string& debug_name); + + void CalculateProperties(); VAddr addr; + std::size_t shader_length; Maxwell::ShaderProgram program_type; GLShader::ShaderSetup setup; GLShader::ShaderEntries entries; @@ -104,6 +111,8 @@ private: class ShaderCacheOpenGL final : public RasterizerCache<Shader> { public: + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer); + /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index dcf6941b0..1bb09e61b 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -3,12 +3,12 @@ // Refer to the license.txt file included. #include <map> +#include <optional> #include <set> #include <string> #include <string_view> #include <unordered_set> -#include <boost/optional.hpp> #include <fmt/format.h> #include "common/assert.h" @@ -34,11 +34,30 @@ constexpr u32 PROGRAM_HEADER_SIZE = sizeof(Tegra::Shader::Header); constexpr u32 MAX_GEOMETRY_BUFFERS = 6; constexpr u32 MAX_ATTRIBUTES = 0x100; // Size in vec4s, this value is untested +static const char* INTERNAL_FLAG_NAMES[] = {"zero_flag", "sign_flag", "carry_flag", + "overflow_flag"}; + +enum class InternalFlag : u64 { + ZeroFlag = 0, + SignFlag = 1, + CarryFlag = 2, + OverflowFlag = 3, + Amount +}; + class DecompileFail : public std::runtime_error { public: using std::runtime_error::runtime_error; }; +/// Generates code to use for a swizzle operation. 
+static std::string GetSwizzle(u64 elem) { + ASSERT(elem <= 3); + std::string swizzle = "."; + swizzle += "xyzw"[elem]; + return swizzle; +} + /// Translate topology static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { switch (topology) { @@ -49,8 +68,7 @@ static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { case Tegra::Shader::OutputTopology::TriangleStrip: return "triangle_strip"; default: - LOG_CRITICAL(Render_OpenGL, "Unknown output topology {}", static_cast<u32>(topology)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); return "points"; } } @@ -85,7 +103,8 @@ struct Subroutine { class ControlFlowAnalyzer { public: ControlFlowAnalyzer(const ProgramCode& program_code, u32 main_offset, const std::string& suffix) - : program_code(program_code) { + : program_code(program_code), shader_coverage_begin(main_offset), + shader_coverage_end(main_offset + 1) { // Recursively finds all subroutines. const Subroutine& program_main = AddSubroutine(main_offset, PROGRAM_END, suffix); @@ -97,10 +116,16 @@ public: return std::move(subroutines); } + std::size_t GetShaderLength() const { + return shader_coverage_end * sizeof(u64); + } + private: const ProgramCode& program_code; std::set<Subroutine> subroutines; std::map<std::pair<u32, u32>, ExitMethod> exit_method_map; + u32 shader_coverage_begin; + u32 shader_coverage_end; /// Adds and analyzes a new subroutine if it is not added yet. const Subroutine& AddSubroutine(u32 begin, u32 end, const std::string& suffix) { @@ -142,9 +167,12 @@ private: return exit_method; for (u32 offset = begin; offset != end && offset != PROGRAM_END; ++offset) { + shader_coverage_begin = std::min(shader_coverage_begin, offset); + shader_coverage_end = std::max(shader_coverage_end, offset + 1); + const Instruction instr = {program_code[offset]}; if (const auto opcode = OpCode::Decode(instr)) { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::EXIT: { // The EXIT instruction can be predicated, which means that the shader can // conditionally end on this instruction. We have to consider the case where the @@ -167,8 +195,8 @@ private: case OpCode::Id::SSY: case OpCode::Id::PBK: { // The SSY and PBK use a similar encoding as the BRA instruction. - ASSERT_MSG(instr.bra.constant_buffer == 0, - "Constant buffer branching is not supported"); + UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, + "Constant buffer branching is not supported"); const u32 target = offset + instr.bra.GetBranchTarget(); labels.insert(target); // Continue scanning for an exit method. 
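While recursively scanning subroutines, the control-flow analyzer also widens a coverage window over every visited instruction offset; GetShaderLength() then converts the upper bound of that window into bytes, which is what the cached shader reports as its size. A toy illustration of the bookkeeping, using hypothetical offsets:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <initializer_list>

int main() {
    const std::uint32_t main_offset = 10;
    std::uint32_t coverage_begin = main_offset;
    std::uint32_t coverage_end = main_offset + 1;

    // Offsets visited while walking the subroutines (hypothetical values).
    for (const std::uint32_t offset : {10u, 11u, 12u, 40u, 41u, 25u}) {
        coverage_begin = std::min(coverage_begin, offset);
        coverage_end = std::max(coverage_end, offset + 1);
    }

    // Mirrors GetShaderLength(): one 64-bit instruction word per covered offset.
    const std::size_t shader_length = coverage_end * sizeof(std::uint64_t);
    return shader_length == 336 ? 0 : 1; // (41 + 1) * 8 bytes for the offsets above
}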
@@ -181,14 +209,53 @@ private: } }; +template <typename T> +class ShaderScopedScope { +public: + explicit ShaderScopedScope(T& writer, std::string_view begin_expr, std::string end_expr) + : writer(writer), end_expr(std::move(end_expr)) { + + if (begin_expr.empty()) { + writer.AddLine('{'); + } else { + writer.AddExpression(begin_expr); + writer.AddLine(" {"); + } + ++writer.scope; + } + + ShaderScopedScope(const ShaderScopedScope&) = delete; + + ~ShaderScopedScope() { + --writer.scope; + if (end_expr.empty()) { + writer.AddLine('}'); + } else { + writer.AddExpression("} "); + writer.AddExpression(end_expr); + writer.AddLine(';'); + } + } + + ShaderScopedScope& operator=(const ShaderScopedScope&) = delete; + +private: + T& writer; + std::string end_expr; +}; + class ShaderWriter { public: - void AddLine(std::string_view text) { + void AddExpression(std::string_view text) { DEBUG_ASSERT(scope >= 0); if (!text.empty()) { AppendIndentation(); } shader_source += text; + } + + void AddLine(std::string_view text) { + AddExpression(text); AddNewLine(); } @@ -208,6 +275,11 @@ public: return std::move(shader_source); } + ShaderScopedScope<ShaderWriter> Scope(std::string_view begin_expr = {}, + std::string end_expr = {}) { + return ShaderScopedScope(*this, begin_expr, end_expr); + } + int scope = 0; private: @@ -258,14 +330,6 @@ private: const std::string& suffix; }; -enum class InternalFlag : u64 { - ZeroFlag = 0, - CarryFlag = 1, - OverflowFlag = 2, - NaNFlag = 3, - Amount -}; - /** * Used to manage shader registers that are emulated with GLSL. This class keeps track of the state * of all registers (e.g. whether they are currently being used as Floats or Integers), and @@ -283,6 +347,15 @@ public: BuildInputList(); } + void SetConditionalCodesFromExpression(const std::string& expresion) { + SetInternalFlag(InternalFlag::ZeroFlag, "(" + expresion + ") == 0"); + LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete."); + } + + void SetConditionalCodesFromRegister(const Register& reg, u64 dest_elem = 0) { + SetConditionalCodesFromExpression(GetRegister(reg, static_cast<u32>(dest_elem))); + } + /** * Returns code that does an integer size conversion for the specified size. * @param value Value to perform integer size conversion on. @@ -299,8 +372,8 @@ public: // Default - do nothing return value; default: - LOG_CRITICAL(HW_GPU, "Unimplemented conversion size {}", static_cast<u32>(size)); - UNREACHABLE(); + UNREACHABLE_MSG("Unimplemented conversion size: {}", static_cast<u32>(size)); + return value; } } @@ -337,14 +410,24 @@ public: * @param dest_num_components Number of components in the destination. * @param value_num_components Number of components in the value. * @param is_saturated Optional, when True, saturates the provided value. + * @param sets_cc Optional, when True, sets the corresponding values to the implemented + * condition flags. * @param dest_elem Optional, the destination element to use for the operation. */ void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value, u64 dest_num_components, u64 value_num_components, - bool is_saturated = false, u64 dest_elem = 0, bool precise = false) { - - SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value, - dest_num_components, value_num_components, dest_elem, precise); + bool is_saturated = false, bool sets_cc = false, u64 dest_elem = 0, + bool precise = false) { + const std::string clamped_value = is_saturated ? 
"clamp(" + value + ", 0.0, 1.0)" : value; + SetRegister(reg, elem, clamped_value, dest_num_components, value_num_components, dest_elem, + precise); + if (sets_cc) { + if (reg == Register::ZeroIndex) { + SetConditionalCodesFromExpression(clamped_value); + } else { + SetConditionalCodesFromRegister(reg, dest_elem); + } + } } /** @@ -355,24 +438,29 @@ public: * @param dest_num_components Number of components in the destination. * @param value_num_components Number of components in the value. * @param is_saturated Optional, when True, saturates the provided value. + * @param sets_cc Optional, when True, sets the corresponding values to the implemented + * condition flags. * @param dest_elem Optional, the destination element to use for the operation. * @param size Register size to use for conversion instructions. */ void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem, const std::string& value, u64 dest_num_components, u64 value_num_components, bool is_saturated = false, - u64 dest_elem = 0, Register::Size size = Register::Size::Word, - bool sets_cc = false) { - ASSERT_MSG(!is_saturated, "Unimplemented"); - + bool sets_cc = false, u64 dest_elem = 0, + Register::Size size = Register::Size::Word) { + UNIMPLEMENTED_IF(is_saturated); + const std::string final_value = ConvertIntegerSize(value, size); const std::string func{is_signed ? "intBitsToFloat" : "uintBitsToFloat"}; - SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')', - dest_num_components, value_num_components, dest_elem, false); + SetRegister(reg, elem, func + '(' + final_value + ')', dest_num_components, + value_num_components, dest_elem, false); if (sets_cc) { - const std::string zero_condition = "( " + ConvertIntegerSize(value, size) + " == 0 )"; - SetInternalFlag(InternalFlag::ZeroFlag, zero_condition); + if (reg == Register::ZeroIndex) { + SetConditionalCodesFromExpression(final_value); + } else { + SetConditionalCodesFromRegister(reg, dest_elem); + } } } @@ -391,7 +479,7 @@ public: Tegra::Shader::HalfMerge merge, u64 dest_num_components, u64 value_num_components, bool is_saturated = false, u64 dest_elem = 0) { - ASSERT_MSG(!is_saturated, "Unimplemented"); + UNIMPLEMENTED_IF(is_saturated); const std::string result = [&]() { switch (merge) { @@ -405,10 +493,10 @@ public: // pack. I couldn't test this on hardware but it shouldn't really matter since most // of the time when a Mrg_* flag is used both components will be mirrored. That // being said, it deserves a test. 
- return "((" + GetRegisterAsInteger(reg, 0, false) + + return "uintBitsToFloat((" + GetRegisterAsInteger(reg, 0, false) + " & 0xffff0000) | (packHalf2x16(" + value + ") & 0x0000ffff))"; case Tegra::Shader::HalfMerge::Mrg_H1: - return "((" + GetRegisterAsInteger(reg, 0, false) + + return "uintBitsToFloat((" + GetRegisterAsInteger(reg, 0, false) + " & 0x0000ffff) | (packHalf2x16(" + value + ") & 0xffff0000))"; default: UNREACHABLE(); @@ -430,7 +518,7 @@ public: */ void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute, const Tegra::Shader::IpaMode& input_mode, - boost::optional<Register> vertex = {}) { + std::optional<Register> vertex = {}) { const std::string dest = GetRegisterAsFloat(reg); const std::string src = GetInputAttribute(attribute, input_mode, vertex) + GetSwizzle(elem); shader.AddLine(dest + " = " + src + ';'); @@ -455,24 +543,25 @@ public: shader.AddLine("lmem[" + index + "] = " + func + '(' + value + ");"); } - std::string GetControlCode(const Tegra::Shader::ControlCode cc) const { + std::string GetConditionCode(const Tegra::Shader::ConditionCode cc) const { switch (cc) { - case Tegra::Shader::ControlCode::NEU: + case Tegra::Shader::ConditionCode::NEU: return "!(" + GetInternalFlag(InternalFlag::ZeroFlag) + ')'; default: - LOG_CRITICAL(HW_GPU, "Unimplemented Control Code {}", static_cast<u32>(cc)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented condition code: {}", static_cast<u32>(cc)); return "false"; } } - std::string GetInternalFlag(const InternalFlag ii) const { - const u32 code = static_cast<u32>(ii); - return "internalFlag_" + std::to_string(code) + suffix; + std::string GetInternalFlag(const InternalFlag flag) const { + const auto index = static_cast<u32>(flag); + ASSERT(index < static_cast<u32>(InternalFlag::Amount)); + + return std::string(INTERNAL_FLAG_NAMES[index]) + '_' + suffix; } - void SetInternalFlag(const InternalFlag ii, const std::string& value) const { - shader.AddLine(GetInternalFlag(ii) + " = " + value + ';'); + void SetInternalFlag(const InternalFlag flag, const std::string& value) const { + shader.AddLine(GetInternalFlag(flag) + " = " + value + ';'); } /** @@ -487,27 +576,43 @@ public: const Register& buf_reg) { const std::string dest = GetOutputAttribute(attribute); const std::string src = GetRegisterAsFloat(val_reg); + if (dest.empty()) + return; - if (!dest.empty()) { - // Can happen with unknown/unimplemented output attributes, in which case we ignore the - // instruction for now. - if (stage == Maxwell3D::Regs::ShaderStage::Geometry) { - // TODO(Rodrigo): nouveau sets some attributes after setting emitting a geometry - // shader. These instructions use a dirty register as buffer index. To avoid some - // drivers from complaining for the out of boundary writes, guard them. - const std::string buf_index{"min(" + GetRegisterAsInteger(buf_reg) + ", " + - std::to_string(MAX_GEOMETRY_BUFFERS - 1) + ')'}; - shader.AddLine("amem[" + buf_index + "][" + - std::to_string(static_cast<u32>(attribute)) + ']' + - GetSwizzle(elem) + " = " + src + ';'); - } else { - if (attribute == Attribute::Index::PointSize) { - fixed_pipeline_output_attributes_used.insert(attribute); - shader.AddLine(dest + " = " + src + ';'); - } else { - shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';'); - } - } + // Can happen with unknown/unimplemented output attributes, in which case we ignore the + // instruction for now. 
+ if (stage == Maxwell3D::Regs::ShaderStage::Geometry) { + // TODO(Rodrigo): nouveau sets some attributes after setting emitting a geometry + // shader. These instructions use a dirty register as buffer index, to avoid some + // drivers from complaining about out of boundary writes, guard them. + const std::string buf_index{"((" + GetRegisterAsInteger(buf_reg) + ") % " + + std::to_string(MAX_GEOMETRY_BUFFERS) + ')'}; + shader.AddLine("amem[" + buf_index + "][" + + std::to_string(static_cast<u32>(attribute)) + ']' + GetSwizzle(elem) + + " = " + src + ';'); + return; + } + + switch (attribute) { + case Attribute::Index::ClipDistances0123: + case Attribute::Index::ClipDistances4567: { + const u64 index = (attribute == Attribute::Index::ClipDistances4567 ? 4 : 0) + elem; + UNIMPLEMENTED_IF_MSG( + ((header.vtg.clip_distances >> index) & 1) == 0, + "Shader is setting gl_ClipDistance{} without enabling it in the header", index); + + clip_distances[index] = true; + fixed_pipeline_output_attributes_used.insert(attribute); + shader.AddLine(dest + '[' + std::to_string(index) + "] = " + src + ';'); + break; + } + case Attribute::Index::PointSize: + fixed_pipeline_output_attributes_used.insert(attribute); + shader.AddLine(dest + " = " + src + ';'); + break; + default: + shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';'); + break; } } @@ -545,6 +650,7 @@ public: return "floatBitsToInt(" + value + ')'; } else { UNREACHABLE(); + return value; } } @@ -574,6 +680,11 @@ public: return used_samplers; } + /// Returns an array of the used clip distances. + const std::array<bool, Maxwell::NumClipDistances>& GetClipDistances() const { + return clip_distances; + } + /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if /// necessary. std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type, @@ -623,8 +734,8 @@ private: /// Generates declarations for internal flags. void GenerateInternalFlags() { - for (u32 ii = 0; ii < static_cast<u64>(InternalFlag::Amount); ii++) { - const InternalFlag code = static_cast<InternalFlag>(ii); + for (u32 flag = 0; flag < static_cast<u32>(InternalFlag::Amount); flag++) { + const InternalFlag code = static_cast<InternalFlag>(flag); declarations.AddLine("bool " + GetInternalFlag(code) + " = false;"); } declarations.AddNewLine(); @@ -727,12 +838,19 @@ private: void GenerateVertex() { if (stage != Maxwell3D::Regs::ShaderStage::Vertex) return; + bool clip_distances_declared = false; + declarations.AddLine("out gl_PerVertex {"); ++declarations.scope; declarations.AddLine("vec4 gl_Position;"); for (auto& o : fixed_pipeline_output_attributes_used) { if (o == Attribute::Index::PointSize) declarations.AddLine("float gl_PointSize;"); + if (!clip_distances_declared && (o == Attribute::Index::ClipDistances0123 || + o == Attribute::Index::ClipDistances4567)) { + declarations.AddLine("float gl_ClipDistance[];"); + clip_distances_declared = true; + } } --declarations.scope; declarations.AddLine("};"); @@ -760,8 +878,7 @@ private: u64 dest_num_components, u64 value_num_components, u64 dest_elem, bool precise) { if (reg == Register::ZeroIndex) { - LOG_CRITICAL(HW_GPU, "Cannot set Register::ZeroIndex"); - UNREACHABLE(); + // Setting RZ is a nop in hardware. 
return; } @@ -776,14 +893,12 @@ private: } if (precise && stage != Maxwell3D::Regs::ShaderStage::Fragment) { - shader.AddLine('{'); - ++shader.scope; + const auto scope = shader.Scope(); + // This avoids optimizations of constant propagation and keeps the code as the original // Sadly using the precise keyword causes "linking" errors on fragment shaders. shader.AddLine("precise float tmp = " + src + ';'); shader.AddLine(dest + " = tmp;"); - --shader.scope; - shader.AddLine('}'); } else { shader.AddLine(dest + " = " + src + ';'); } @@ -807,10 +922,14 @@ private: /// Generates code representing an input attribute register. std::string GetInputAttribute(Attribute::Index attribute, const Tegra::Shader::IpaMode& input_mode, - boost::optional<Register> vertex = {}) { + std::optional<Register> vertex = {}) { auto GeometryPass = [&](const std::string& name) { if (stage == Maxwell3D::Regs::ShaderStage::Geometry && vertex) { - return "gs_" + name + '[' + GetRegisterAsInteger(vertex.value(), 0, false) + ']'; + // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games set + // an 0x80000000 index for those and the shader fails to build. Find out why this + // happens and what's its intent. + return "gs_" + name + '[' + GetRegisterAsInteger(*vertex, 0, false) + + " % MAX_VERTEX_INPUT]"; } return name; }; @@ -829,11 +948,12 @@ private: // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex); - return "vec4(0, 0, uintBitsToFloat(instance_id.x), uintBitsToFloat(gl_VertexID))"; + // Config pack's first value is instance_id. + return "vec4(0, 0, uintBitsToFloat(config_pack[0]), uintBitsToFloat(gl_VertexID))"; case Attribute::Index::FrontFacing: // TODO(Subv): Find out what the values are for the other elements. ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment); - return "vec4(0, 0, 0, uintBitsToFloat(gl_FrontFacing ? 1 : 0))"; + return "vec4(0, 0, 0, intBitsToFloat(gl_FrontFacing ? 
-1 : 0))"; default: const u32 index{static_cast<u32>(attribute) - static_cast<u32>(Attribute::Index::Attribute_0)}; @@ -842,16 +962,13 @@ private: if (declr_input_attribute.count(attribute) == 0) { declr_input_attribute[attribute] = input_mode; } else { - if (declr_input_attribute[attribute] != input_mode) { - LOG_CRITICAL(HW_GPU, "Same Input multiple input modes"); - UNREACHABLE(); - } + UNIMPLEMENTED_IF_MSG(declr_input_attribute[attribute] != input_mode, + "Multiple input modes for the same attribute"); } return GeometryPass("input_attribute_" + std::to_string(index)); } - LOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", static_cast<u32>(attribute)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); } return "vec4(0, 0, 0, 0)"; @@ -877,24 +994,20 @@ private: break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled Ipa InterpMode: {}", static_cast<u32>(interp_mode)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled IPA interp mode: {}", static_cast<u32>(interp_mode)); } } switch (sample_mode) { - case Tegra::Shader::IpaSampleMode::Centroid: { - // Note not implemented, it can be implemented with the "centroid " keyword in glsl; - LOG_CRITICAL(HW_GPU, "Ipa Sampler Mode: centroid, not implemented"); - UNREACHABLE(); + case Tegra::Shader::IpaSampleMode::Centroid: + // It can be implemented with the "centroid " keyword in glsl + UNIMPLEMENTED_MSG("Unimplemented IPA sampler mode centroid"); break; - } - case Tegra::Shader::IpaSampleMode::Default: { + case Tegra::Shader::IpaSampleMode::Default: // Default, n/a break; - } default: { - LOG_CRITICAL(HW_GPU, "Unhandled Ipa SampleMode: {}", static_cast<u32>(sample_mode)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented IPA sampler mode: {}", static_cast<u32>(sample_mode)); + break; } } return out; @@ -907,6 +1020,10 @@ private: return "gl_PointSize"; case Attribute::Index::Position: return "position"; + case Attribute::Index::ClipDistances0123: + case Attribute::Index::ClipDistances4567: { + return "gl_ClipDistance"; + } default: const u32 index{static_cast<u32>(attribute) - static_cast<u32>(Attribute::Index::Attribute_0)}; @@ -915,20 +1032,11 @@ private: return "output_attribute_" + std::to_string(index); } - LOG_CRITICAL(HW_GPU, "Unhandled output attribute: {}", index); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled output attribute={}", index); return {}; } } - /// Generates code to use for a swizzle operation. 
- static std::string GetSwizzle(u64 elem) { - ASSERT(elem <= 3); - std::string swizzle = "."; - swizzle += "xyzw"[elem]; - return swizzle; - } - ShaderWriter& shader; ShaderWriter& declarations; std::vector<GLSLRegister> regs; @@ -940,15 +1048,17 @@ private: const std::string& suffix; const Tegra::Shader::Header& header; std::unordered_set<Attribute::Index> fixed_pipeline_output_attributes_used; + std::array<bool, Maxwell::NumClipDistances> clip_distances{}; u64 local_memory_size; }; class GLSLGenerator { public: GLSLGenerator(const std::set<Subroutine>& subroutines, const ProgramCode& program_code, - u32 main_offset, Maxwell3D::Regs::ShaderStage stage, const std::string& suffix) + u32 main_offset, Maxwell3D::Regs::ShaderStage stage, const std::string& suffix, + std::size_t shader_length) : subroutines(subroutines), program_code(program_code), main_offset(main_offset), - stage(stage), suffix(suffix) { + stage(stage), suffix(suffix), shader_length(shader_length) { std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); local_memory_size = header.GetLocalMemorySize(); regs.SetLocalMemory(local_memory_size); @@ -961,7 +1071,8 @@ public: /// Returns entries in the shader that are useful for external functions ShaderEntries GetEntries() const { - return {regs.GetConstBuffersDeclarations(), regs.GetSamplers()}; + return {regs.GetConstBuffersDeclarations(), regs.GetSamplers(), regs.GetClipDistances(), + shader_length}; } private: @@ -1066,19 +1177,26 @@ private: const std::string& op_a, const std::string& op_b) const { using Tegra::Shader::PredCondition; static const std::unordered_map<PredCondition, const char*> PredicateComparisonStrings = { - {PredCondition::LessThan, "<"}, {PredCondition::Equal, "=="}, - {PredCondition::LessEqual, "<="}, {PredCondition::GreaterThan, ">"}, - {PredCondition::NotEqual, "!="}, {PredCondition::GreaterEqual, ">="}, - {PredCondition::LessThanWithNan, "<"}, {PredCondition::NotEqualWithNan, "!="}, - {PredCondition::GreaterThanWithNan, ">"}, {PredCondition::GreaterEqualWithNan, ">="}}; + {PredCondition::LessThan, "<"}, + {PredCondition::Equal, "=="}, + {PredCondition::LessEqual, "<="}, + {PredCondition::GreaterThan, ">"}, + {PredCondition::NotEqual, "!="}, + {PredCondition::GreaterEqual, ">="}, + {PredCondition::LessThanWithNan, "<"}, + {PredCondition::NotEqualWithNan, "!="}, + {PredCondition::LessEqualWithNan, "<="}, + {PredCondition::GreaterThanWithNan, ">"}, + {PredCondition::GreaterEqualWithNan, ">="}}; const auto& comparison{PredicateComparisonStrings.find(condition)}; - ASSERT_MSG(comparison != PredicateComparisonStrings.end(), - "Unknown predicate comparison operation"); + UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonStrings.end(), + "Unknown predicate comparison operation"); std::string predicate{'(' + op_a + ") " + comparison->second + " (" + op_b + ')'}; if (condition == PredCondition::LessThanWithNan || condition == PredCondition::NotEqualWithNan || + condition == PredCondition::LessEqualWithNan || condition == PredCondition::GreaterThanWithNan || condition == PredCondition::GreaterEqualWithNan) { predicate += " || isnan(" + op_a + ") || isnan(" + op_b + ')'; @@ -1102,7 +1220,7 @@ private: }; auto op = PredicateOperationStrings.find(operation); - ASSERT_MSG(op != PredicateOperationStrings.end(), "Unknown predicate operation"); + UNIMPLEMENTED_IF_MSG(op == PredicateOperationStrings.end(), "Unknown predicate operation"); return op->second; } @@ -1180,7 +1298,7 @@ private: void WriteLogicOperation(Register dest, LogicOperation 
logic_op, const std::string& op_a, const std::string& op_b, Tegra::Shader::PredicateResultMode predicate_mode, - Tegra::Shader::Pred predicate) { + Tegra::Shader::Pred predicate, const bool set_cc) { std::string result{}; switch (logic_op) { case LogicOperation::And: { @@ -1200,12 +1318,11 @@ private: break; } default: - LOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(logic_op)); } if (dest != Tegra::Shader::Register::ZeroIndex) { - regs.SetRegisterToInteger(dest, true, 0, result, 1, 1); + regs.SetRegisterToInteger(dest, true, 0, result, 1, 1, false, set_cc); } using Tegra::Shader::PredicateResultMode; @@ -1219,14 +1336,14 @@ private: SetPredicate(static_cast<u64>(predicate), '(' + result + ") != 0"); break; default: - LOG_CRITICAL(HW_GPU, "Unimplemented predicate result mode: {}", - static_cast<u32>(predicate_mode)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented predicate result mode: {}", + static_cast<u32>(predicate_mode)); } } void WriteLop3Instruction(Register dest, const std::string& op_a, const std::string& op_b, - const std::string& op_c, const std::string& imm_lut) { + const std::string& op_c, const std::string& imm_lut, + const bool set_cc) { if (dest == Tegra::Shader::Register::ZeroIndex) { return; } @@ -1249,17 +1366,10 @@ private: result += ')'; - regs.SetRegisterToInteger(dest, true, 0, result, 1, 1); + regs.SetRegisterToInteger(dest, true, 0, result, 1, 1, false, set_cc); } - void WriteTexsInstruction(const Instruction& instr, const std::string& coord, - const std::string& texture) { - // Add an extra scope and declare the texture coords inside to prevent - // overwriting them in case they are used as outputs of the texs instruction. - shader.AddLine('{'); - ++shader.scope; - shader.AddLine(coord); - + void WriteTexsInstructionFloat(const Instruction& instr, const std::string& texture) { // TEXS has two destination registers and a swizzle. The first two elements in the swizzle // go into gpr0+0 and gpr0+1, and the rest goes into gpr28+0 and gpr28+1 @@ -1271,37 +1381,62 @@ private: if (written_components < 2) { // Write the first two swizzle components to gpr0 and gpr0+1 - regs.SetRegisterToFloat(instr.gpr0, component, texture, 1, 4, false, + regs.SetRegisterToFloat(instr.gpr0, component, texture, 1, 4, false, false, written_components % 2); } else { ASSERT(instr.texs.HasTwoDestinations()); // Write the rest of the swizzle components to gpr28 and gpr28+1 - regs.SetRegisterToFloat(instr.gpr28, component, texture, 1, 4, false, + regs.SetRegisterToFloat(instr.gpr28, component, texture, 1, 4, false, false, written_components % 2); } ++written_components; } + } + + void WriteTexsInstructionHalfFloat(const Instruction& instr, const std::string& texture) { + // TEXS.F16 destionation registers are packed in two registers in pairs (just like any half + // float instruction). + + std::array<std::string, 4> components; + u32 written_components = 0; + + for (u32 component = 0; component < 4; ++component) { + if (!instr.texs.IsComponentEnabled(component)) + continue; + components[written_components++] = texture + GetSwizzle(component); + } + if (written_components == 0) + return; - --shader.scope; - shader.AddLine('}'); + const auto BuildComponent = [&](std::string low, std::string high, bool high_enabled) { + return "vec2(" + low + ", " + (high_enabled ? 
high : "0") + ')'; + }; + + regs.SetRegisterToHalfFloat( + instr.gpr0, 0, BuildComponent(components[0], components[1], written_components > 1), + Tegra::Shader::HalfMerge::H0_H1, 1, 1); + + if (written_components > 2) { + ASSERT(instr.texs.HasTwoDestinations()); + regs.SetRegisterToHalfFloat( + instr.gpr28, 0, + BuildComponent(components[2], components[3], written_components > 3), + Tegra::Shader::HalfMerge::H0_H1, 1, 1); + } } static u32 TextureCoordinates(Tegra::Shader::TextureType texture_type) { switch (texture_type) { - case Tegra::Shader::TextureType::Texture1D: { + case Tegra::Shader::TextureType::Texture1D: return 1; - } - case Tegra::Shader::TextureType::Texture2D: { + case Tegra::Shader::TextureType::Texture2D: return 2; - } case Tegra::Shader::TextureType::Texture3D: - case Tegra::Shader::TextureType::TextureCube: { + case Tegra::Shader::TextureType::TextureCube: return 3; - } default: - LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", static_cast<u32>(texture_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled texture type: {}", static_cast<u32>(texture_type)); return 0; } } @@ -1311,12 +1446,10 @@ private: * top. */ void EmitPushToFlowStack(u32 target) { - shader.AddLine('{'); - ++shader.scope; + const auto scope = shader.Scope(); + shader.AddLine("flow_stack[flow_stack_top] = " + std::to_string(target) + "u;"); shader.AddLine("flow_stack_top++;"); - --shader.scope; - shader.AddLine('}'); } /* @@ -1324,20 +1457,18 @@ private: * popped address and decrementing the stack top. */ void EmitPopFromFlowStack() { - shader.AddLine('{'); - ++shader.scope; + const auto scope = shader.Scope(); + shader.AddLine("flow_stack_top--;"); shader.AddLine("jmp_to = flow_stack[flow_stack_top];"); shader.AddLine("break;"); - --shader.scope; - shader.AddLine('}'); } /// Writes the output values from a fragment shader to the corresponding GLSL output variables. void EmitFragmentOutputsWrite() { ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment); - ASSERT_MSG(header.ps.omap.sample_mask == 0, "Samplemask write is unimplemented"); + UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Samplemask write is unimplemented"); shader.AddLine("if (alpha_test[0] != 0) {"); ++shader.scope; @@ -1403,7 +1534,7 @@ private: case Tegra::Shader::VideoType::Size32: // TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when // this type is used (1 * 1 + 0 == 0x5b800000). Until a better - // explanation is found: assert. + // explanation is found: abort. UNIMPLEMENTED(); return zero; case Tegra::Shader::VideoType::Invalid: @@ -1442,6 +1573,252 @@ private: } } + std::pair<size_t, std::string> ValidateAndGetCoordinateElement( + const Tegra::Shader::TextureType texture_type, const bool depth_compare, + const bool is_array, const bool lod_bias_enabled, size_t max_coords, size_t max_inputs) { + const size_t coord_count = TextureCoordinates(texture_type); + + size_t total_coord_count = coord_count + (is_array ? 1 : 0) + (depth_compare ? 1 : 0); + const size_t total_reg_count = total_coord_count + (lod_bias_enabled ? 1 : 0); + if (total_coord_count > max_coords || total_reg_count > max_inputs) { + UNIMPLEMENTED_MSG("Unsupported Texture operation"); + total_coord_count = std::min(total_coord_count, max_coords); + } + // 1D.DC opengl is using a vec3 but 2nd component is ignored later. + total_coord_count += + (depth_compare && !is_array && texture_type == Tegra::Shader::TextureType::Texture1D) + ? 
1 + : 0; + + constexpr std::array<const char*, 5> coord_container{ + {"", "float coord = (", "vec2 coord = vec2(", "vec3 coord = vec3(", + "vec4 coord = vec4("}}; + + return std::pair<size_t, std::string>(coord_count, coord_container[total_coord_count]); + } + + std::string GetTextureCode(const Tegra::Shader::Instruction& instr, + const Tegra::Shader::TextureType texture_type, + const Tegra::Shader::TextureProcessMode process_mode, + const bool depth_compare, const bool is_array, + const size_t bias_offset) { + + if ((texture_type == Tegra::Shader::TextureType::Texture3D && + (is_array || depth_compare)) || + (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && + depth_compare)) { + UNIMPLEMENTED_MSG("This method is not supported."); + } + + const std::string sampler = + GetSampler(instr.sampler, texture_type, is_array, depth_compare); + + const bool lod_needed = process_mode == Tegra::Shader::TextureProcessMode::LZ || + process_mode == Tegra::Shader::TextureProcessMode::LL || + process_mode == Tegra::Shader::TextureProcessMode::LLA; + + // LOD selection (either via bias or explicit textureLod) not supported in GL for + // sampler2DArrayShadow and samplerCubeArrayShadow. + const bool gl_lod_supported = !( + (texture_type == Tegra::Shader::TextureType::Texture2D && is_array && depth_compare) || + (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && depth_compare)); + + const std::string read_method = lod_needed && gl_lod_supported ? "textureLod(" : "texture("; + std::string texture = read_method + sampler + ", coord"; + + UNIMPLEMENTED_IF(process_mode != Tegra::Shader::TextureProcessMode::None && + !gl_lod_supported); + + if (process_mode != Tegra::Shader::TextureProcessMode::None && gl_lod_supported) { + if (process_mode == Tegra::Shader::TextureProcessMode::LZ) { + texture += ", 0.0"; + } else { + // If present, lod or bias are always stored in the register indexed by the + // gpr20 + // field with an offset depending on the usage of the other registers + texture += ',' + regs.GetRegisterAsFloat(instr.gpr20.Value() + bias_offset); + } + } + texture += ")"; + return texture; + } + + std::pair<std::string, std::string> GetTEXCode( + const Instruction& instr, const Tegra::Shader::TextureType texture_type, + const Tegra::Shader::TextureProcessMode process_mode, const bool depth_compare, + const bool is_array) { + const bool lod_bias_enabled = (process_mode != Tegra::Shader::TextureProcessMode::None && + process_mode != Tegra::Shader::TextureProcessMode::LZ); + + const auto [coord_count, coord_dcl] = ValidateAndGetCoordinateElement( + texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5); + // If enabled arrays index is always stored in the gpr8 field + const u64 array_register = instr.gpr8.Value(); + // First coordinate index is the gpr8 or gpr8 + 1 when arrays are used + const u64 coord_register = array_register + (is_array ? 1 : 0); + + std::string coord = coord_dcl; + for (size_t i = 0; i < coord_count;) { + coord += regs.GetRegisterAsFloat(coord_register + i); + ++i; + if (i != coord_count) { + coord += ','; + } + } + // 1D.DC in opengl the 2nd component is ignored. 
+ if (depth_compare && !is_array && texture_type == Tegra::Shader::TextureType::Texture1D) { + coord += ",0.0"; + } + if (is_array) { + coord += ',' + regs.GetRegisterAsInteger(array_register); + } + if (depth_compare) { + // Depth is always stored in the register signaled by gpr20 + // or in the next register if lod or bias are used + const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0); + coord += ',' + regs.GetRegisterAsFloat(depth_register); + } + coord += ");"; + return std::make_pair( + coord, GetTextureCode(instr, texture_type, process_mode, depth_compare, is_array, 0)); + } + + std::pair<std::string, std::string> GetTEXSCode( + const Instruction& instr, const Tegra::Shader::TextureType texture_type, + const Tegra::Shader::TextureProcessMode process_mode, const bool depth_compare, + const bool is_array) { + const bool lod_bias_enabled = (process_mode != Tegra::Shader::TextureProcessMode::None && + process_mode != Tegra::Shader::TextureProcessMode::LZ); + + const auto [coord_count, coord_dcl] = ValidateAndGetCoordinateElement( + texture_type, depth_compare, is_array, lod_bias_enabled, 4, 4); + // If enabled arrays index is always stored in the gpr8 field + const u64 array_register = instr.gpr8.Value(); + // First coordinate index is stored in gpr8 field or (gpr8 + 1) when arrays are used + const u64 coord_register = array_register + (is_array ? 1 : 0); + const u64 last_coord_register = + (is_array || !(lod_bias_enabled || depth_compare) || (coord_count > 2)) + ? static_cast<u64>(instr.gpr20.Value()) + : coord_register + 1; + + std::string coord = coord_dcl; + for (size_t i = 0; i < coord_count; ++i) { + const bool last = (i == (coord_count - 1)) && (coord_count > 1); + coord += regs.GetRegisterAsFloat(last ? last_coord_register : coord_register + i); + if (i < coord_count - 1) { + coord += ','; + } + } + + if (is_array) { + coord += ',' + regs.GetRegisterAsInteger(array_register); + } + if (depth_compare) { + // Depth is always stored in the register signaled by gpr20 + // or in the next register if lod or bias are used + const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0); + coord += ',' + regs.GetRegisterAsFloat(depth_register); + } + coord += ");"; + + return std::make_pair(coord, + GetTextureCode(instr, texture_type, process_mode, depth_compare, + is_array, (coord_count > 2 ? 1 : 0))); + } + + std::pair<std::string, std::string> GetTLD4Code(const Instruction& instr, + const Tegra::Shader::TextureType texture_type, + const bool depth_compare, const bool is_array) { + + const size_t coord_count = TextureCoordinates(texture_type); + const size_t total_coord_count = coord_count + (is_array ? 1 : 0); + const size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0); + + constexpr std::array<const char*, 5> coord_container{ + {"", "", "vec2 coord = vec2(", "vec3 coord = vec3(", "vec4 coord = vec4("}}; + + // If enabled arrays index is always stored in the gpr8 field + const u64 array_register = instr.gpr8.Value(); + // First coordinate index is the gpr8 or gpr8 + 1 when arrays are used + const u64 coord_register = array_register + (is_array ? 
1 : 0); + + std::string coord = coord_container[total_coord_count]; + for (size_t i = 0; i < coord_count;) { + coord += regs.GetRegisterAsFloat(coord_register + i); + ++i; + if (i != coord_count) { + coord += ','; + } + } + + if (is_array) { + coord += ',' + regs.GetRegisterAsInteger(array_register); + } + coord += ");"; + + const std::string sampler = + GetSampler(instr.sampler, texture_type, is_array, depth_compare); + + std::string texture = "textureGather(" + sampler + ", coord, "; + if (depth_compare) { + // Depth is always stored in the register signaled by gpr20 + texture += regs.GetRegisterAsFloat(instr.gpr20.Value()) + ')'; + } else { + texture += std::to_string(instr.tld4.component) + ')'; + } + return std::make_pair(coord, texture); + } + + std::pair<std::string, std::string> GetTLDSCode(const Instruction& instr, + const Tegra::Shader::TextureType texture_type, + const bool is_array) { + + const size_t coord_count = TextureCoordinates(texture_type); + const size_t total_coord_count = coord_count + (is_array ? 1 : 0); + const bool lod_enabled = + instr.tlds.GetTextureProcessMode() == Tegra::Shader::TextureProcessMode::LL; + + constexpr std::array<const char*, 4> coord_container{ + {"", "int coords = (", "ivec2 coords = ivec2(", "ivec3 coords = ivec3("}}; + + std::string coord = coord_container[total_coord_count]; + + // If enabled arrays index is always stored in the gpr8 field + const u64 array_register = instr.gpr8.Value(); + + // if is array gpr20 is used + const u64 coord_register = is_array ? instr.gpr20.Value() : instr.gpr8.Value(); + + const u64 last_coord_register = + ((coord_count > 2) || (coord_count == 2 && !lod_enabled)) && !is_array + ? static_cast<u64>(instr.gpr20.Value()) + : coord_register + 1; + + for (size_t i = 0; i < coord_count; ++i) { + const bool last = (i == (coord_count - 1)) && (coord_count > 1); + coord += regs.GetRegisterAsInteger(last ? last_coord_register : coord_register + i); + if (i < coord_count - 1) { + coord += ','; + } + } + if (is_array) { + coord += ',' + regs.GetRegisterAsInteger(array_register); + } + coord += ");"; + + const std::string sampler = GetSampler(instr.sampler, texture_type, is_array, false); + + std::string texture = "texelFetch(" + sampler + ", coords"; + + if (lod_enabled) { + // When lod is used always is in grp20 + texture += ", " + regs.GetRegisterAsInteger(instr.gpr20) + ')'; + } else { + texture += ", 0)"; + } + return std::make_pair(coord, texture); + } + /** * Compiles a single instruction from Tegra to GLSL. * @param offset the offset of the Tegra shader instruction. @@ -1459,21 +1836,20 @@ private: // Decoding failure if (!opcode) { - LOG_CRITICAL(HW_GPU, "Unhandled instruction: {0:x}", instr.value); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled instruction: {0:x}", instr.value); return offset + 1; } shader.AddLine( - fmt::format("// {}: {} (0x{:016x})", offset, opcode->GetName(), instr.value)); + fmt::format("// {}: {} (0x{:016x})", offset, opcode->get().GetName(), instr.value)); using Tegra::Shader::Pred; - ASSERT_MSG(instr.pred.full_pred != Pred::NeverExecute, - "NeverExecute predicate not implemented"); + UNIMPLEMENTED_IF_MSG(instr.pred.full_pred == Pred::NeverExecute, + "NeverExecute predicate not implemented"); // Some instructions (like SSY) don't have a predicate field, they are always // unconditionally executed. 
- bool can_be_predicated = OpCode::IsPredicatedInstruction(opcode->GetId()); + bool can_be_predicated = OpCode::IsPredicatedInstruction(opcode->get().GetId()); if (can_be_predicated && instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) { shader.AddLine("if (" + @@ -1483,7 +1859,7 @@ private: ++shader.scope; } - switch (opcode->GetType()) { + switch (opcode->get().GetType()) { case OpCode::Type::Arithmetic: { std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); @@ -1500,7 +1876,7 @@ private: } } - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::MOV_C: case OpCode::Id::MOV_R: { // MOV has neither 'abs' nor 'neg' bits. @@ -1512,19 +1888,37 @@ private: case OpCode::Id::FMUL_R: case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. - ASSERT_MSG(instr.fmul.tab5cb8_2 == 0, "FMUL tab5cb8_2({}) is not implemented", - instr.fmul.tab5cb8_2.Value()); - ASSERT_MSG(instr.fmul.tab5c68_1 == 0, "FMUL tab5cb8_1({}) is not implemented", - instr.fmul.tab5c68_1.Value()); - ASSERT_MSG(instr.fmul.tab5c68_0 == 1, "FMUL tab5cb8_0({}) is not implemented", - instr.fmul.tab5c68_0 .Value()); // SMO typical sends 1 here which seems to be the default - ASSERT_MSG(instr.fmul.cc == 0, "FMUL cc is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, + "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); + UNIMPLEMENTED_IF_MSG( + instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0 .Value()); // SMO typically sends 1 here, which seems to be the default op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b); - regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, - instr.alu.saturate_d, 0, true); + std::string postfactor_op; + if (instr.fmul.postfactor != 0) { + s8 postfactor = static_cast<s8>(instr.fmul.postfactor); + + // The postfactor is encoded as a 3-bit 1's complement value in the instruction and + // interpreted with the logic below.
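// Worked mapping of the 3-bit postfactor field, derived from the branches that follow
// (illustration only): 1 -> "/ 2", 2 -> "/ 4", 3 -> "/ 8" (negated, used as a divisor exponent);
// 4 -> "* 8", 5 -> "* 4", 6 -> "* 2" (mapped through 7 - value, used as a multiplier exponent);
// 7 -> "/ 1" (no scaling); 0 never reaches this point because of the enclosing if.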
+ if (postfactor >= 4) { + postfactor = 7 - postfactor; + } else { + postfactor = 0 - postfactor; + } + + if (postfactor > 0) { + postfactor_op = " * " + std::to_string(1 << postfactor); + } else { + postfactor_op = " / " + std::to_string(1 << -postfactor); + } + } + + regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + postfactor_op, 1, 1, + instr.alu.saturate_d, instr.generates_cc, 0, true); break; } case OpCode::Id::FADD_C: @@ -1534,7 +1928,7 @@ private: op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b); regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, instr.generates_cc, 0, true); break; } case OpCode::Id::MUFU: { @@ -1542,42 +1936,45 @@ private: switch (instr.sub_op) { case SubOp::Cos: regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Sin: regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Ex2: regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Lg2: regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Rcp: regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Rsq: regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; case SubOp::Sqrt: regs.SetRegisterToFloat(instr.gpr0, 0, "sqrt(" + op_a + ')', 1, 1, - instr.alu.saturate_d, 0, true); + instr.alu.saturate_d, false, 0, true); break; default: - LOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}", - static_cast<unsigned>(instr.sub_op.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled MUFU sub op={0:x}", + static_cast<unsigned>(instr.sub_op.Value())); } break; } case OpCode::Id::FMNMX_C: case OpCode::Id::FMNMX_R: case OpCode::Id::FMNMX_IMM: { + UNIMPLEMENTED_IF_MSG( + instr.generates_cc, + "Condition codes generation in FMNMX is partially implemented"); + op_a = GetOperandAbsNeg(op_a, instr.alu.abs_a, instr.alu.negate_a); op_b = GetOperandAbsNeg(op_b, instr.alu.abs_b, instr.alu.negate_b); @@ -1587,7 +1984,7 @@ private: regs.SetRegisterToFloat(instr.gpr0, 0, '(' + condition + ") ? 
min(" + parameters + ") : max(" + parameters + ')', - 1, 1, false, 0, true); + 1, 1, false, instr.generates_cc, 0, true); break; } case OpCode::Id::RRO_C: @@ -1600,26 +1997,29 @@ private: break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled arithmetic instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled arithmetic instruction: {}", opcode->get().GetName()); } } break; } case OpCode::Type::ArithmeticImmediate: { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::MOV32_IMM: { regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1); break; } case OpCode::Id::FMUL32_IMM: { - regs.SetRegisterToFloat(instr.gpr0, 0, - regs.GetRegisterAsFloat(instr.gpr8) + " * " + - GetImmediate32(instr), - 1, 1, instr.fmul32.saturate, 0, true); + regs.SetRegisterToFloat( + instr.gpr0, 0, + regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1, + instr.fmul32.saturate, instr.op_32.generates_cc, 0, true); break; } case OpCode::Id::FADD32I: { + UNIMPLEMENTED_IF_MSG( + instr.op_32.generates_cc, + "Condition codes generation in FADD32I is partially implemented"); + std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); std::string op_b = GetImmediate32(instr); @@ -1639,19 +2039,20 @@ private: op_b = "-(" + op_b + ')'; } - regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, false, 0, true); + regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, false, + instr.op_32.generates_cc, 0, true); break; } } break; } case OpCode::Type::Bfe: { - ASSERT_MSG(!instr.bfe.negate_b, "Unimplemented"); + UNIMPLEMENTED_IF(instr.bfe.negate_b); std::string op_a = instr.bfe.negate_a ? "-" : ""; op_a += regs.GetRegisterAsInteger(instr.gpr8); - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::BFE_IMM: { std::string inner_shift = '(' + op_a + " << " + std::to_string(instr.bfe.GetLeftShiftValue()) + ')'; @@ -1659,17 +2060,38 @@ private: '(' + inner_shift + " >> " + std::to_string(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position) + ')'; - regs.SetRegisterToInteger(instr.gpr0, true, 0, outer_shift, 1, 1); + regs.SetRegisterToInteger(instr.gpr0, true, 0, outer_shift, 1, 1, false, + instr.generates_cc); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled BFE instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); } } break; } + case OpCode::Type::Bfi: { + const auto [base, packed_shift] = [&]() -> std::tuple<std::string, std::string> { + switch (opcode->get().GetId()) { + case OpCode::Id::BFI_IMM_R: + return {regs.GetRegisterAsInteger(instr.gpr39, 0, false), + std::to_string(instr.alu.GetSignedImm20_20())}; + default: + UNREACHABLE(); + return {regs.GetRegisterAsInteger(instr.gpr39, 0, false), + std::to_string(instr.alu.GetSignedImm20_20())}; + } + }(); + const std::string offset = '(' + packed_shift + " & 0xff)"; + const std::string bits = "((" + packed_shift + " >> 8) & 0xff)"; + const std::string insert = regs.GetRegisterAsInteger(instr.gpr8, 0, false); + regs.SetRegisterToInteger(instr.gpr0, false, 0, + "bitfieldInsert(" + base + ", " + insert + ", " + offset + + ", " + bits + ')', + 1, 1, false, instr.generates_cc); + break; + } case OpCode::Type::Shift: { std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true); std::string op_b; @@ -1685,7 +2107,7 @@ private: } } - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::SHR_C: case OpCode::Id::SHR_R: case 
OpCode::Id::SHR_IMM: { @@ -1696,17 +2118,19 @@ private: // Cast to int is superfluous for arithmetic shift, it's only for a logical shift regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(" + op_a + " >> " + op_b + ')', - 1, 1); + 1, 1, false, instr.generates_cc); break; } case OpCode::Id::SHL_C: case OpCode::Id::SHL_R: case OpCode::Id::SHL_IMM: - regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " << " + op_b, 1, 1); + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in SHL is not implemented"); + regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " << " + op_b, 1, 1, false, + instr.generates_cc); break; default: { - LOG_CRITICAL(HW_GPU, "Unhandled shift instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled shift instruction: {}", opcode->get().GetName()); } } break; @@ -1715,15 +2139,20 @@ private: std::string op_a = regs.GetRegisterAsInteger(instr.gpr8); std::string op_b = std::to_string(instr.alu.imm20_32.Value()); - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::IADD32I: + UNIMPLEMENTED_IF_MSG( + instr.op_32.generates_cc, + "Condition codes generation in IADD32I is partially implemented"); + if (instr.iadd32i.negate_a) op_a = "-(" + op_a + ')'; regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1, - instr.iadd32i.saturate != 0); + instr.iadd32i.saturate, instr.op_32.generates_cc); break; case OpCode::Id::LOP32I: { + if (instr.alu.lop32i.invert_a) op_a = "~(" + op_a + ')'; @@ -1732,13 +2161,12 @@ private: WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, Tegra::Shader::PredicateResultMode::None, - Tegra::Shader::Pred::UnusedIndex); + Tegra::Shader::Pred::UnusedIndex, instr.op_32.generates_cc); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticIntegerImmediate instruction: {}", - opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled ArithmeticIntegerImmediate instruction: {}", + opcode->get().GetName()); } } break; @@ -1757,10 +2185,13 @@ private: } } - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::IADD_C: case OpCode::Id::IADD_R: case OpCode::Id::IADD_IMM: { + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in IADD is partially implemented"); + if (instr.alu_integer.negate_a) op_a = "-(" + op_a + ')'; @@ -1768,12 +2199,16 @@ private: op_b = "-(" + op_b + ')'; regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1, - instr.alu.saturate_d); + instr.alu.saturate_d, instr.generates_cc); break; } case OpCode::Id::IADD3_C: case OpCode::Id::IADD3_R: case OpCode::Id::IADD3_IMM: { + UNIMPLEMENTED_IF_MSG( + instr.generates_cc, + "Condition codes generation in IADD3 is partially implemented"); + std::string op_c = regs.GetRegisterAsInteger(instr.gpr39); auto apply_height = [](auto height, auto& oprand) { @@ -1787,13 +2222,12 @@ private: oprand = "((" + oprand + ") >> 16)"; break; default: - LOG_CRITICAL(HW_GPU, "Unhandled IADD3 height: {}", - static_cast<u32>(height.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled IADD3 height: {}", + static_cast<u32>(height.Value())); } }; - if (opcode->GetId() == OpCode::Id::IADD3_R) { + if (opcode->get().GetId() == OpCode::Id::IADD3_R) { apply_height(instr.iadd3.height_a, op_a); apply_height(instr.iadd3.height_b, op_b); apply_height(instr.iadd3.height_c, op_c); @@ -1809,7 +2243,7 @@ private: op_c = "-(" + op_c + ')'; std::string result; - if (opcode->GetId() == OpCode::Id::IADD3_R) { + if (opcode->get().GetId() == 
OpCode::Id::IADD3_R) { switch (instr.iadd3.mode) { case Tegra::Shader::IAdd3Mode::RightShift: // TODO(tech4me): According to @@ -1829,12 +2263,17 @@ private: result = '(' + op_a + " + " + op_b + " + " + op_c + ')'; } - regs.SetRegisterToInteger(instr.gpr0, true, 0, result, 1, 1); + regs.SetRegisterToInteger(instr.gpr0, true, 0, result, 1, 1, false, + instr.generates_cc); break; } case OpCode::Id::ISCADD_C: case OpCode::Id::ISCADD_R: case OpCode::Id::ISCADD_IMM: { + UNIMPLEMENTED_IF_MSG( + instr.generates_cc, + "Condition codes generation in ISCADD is partially implemented"); + if (instr.alu_integer.negate_a) op_a = "-(" + op_a + ')'; @@ -1844,7 +2283,8 @@ private: const std::string shift = std::to_string(instr.alu_integer.shift_amount.Value()); regs.SetRegisterToInteger(instr.gpr0, true, 0, - "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1); + "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1, + false, instr.generates_cc); break; } case OpCode::Id::POPC_C: @@ -1868,6 +2308,7 @@ private: case OpCode::Id::LOP_C: case OpCode::Id::LOP_R: case OpCode::Id::LOP_IMM: { + if (instr.alu.lop.invert_a) op_a = "~(" + op_a + ')'; @@ -1875,7 +2316,8 @@ private: op_b = "~(" + op_b + ')'; WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b, - instr.alu.lop.pred_result_mode, instr.alu.lop.pred48); + instr.alu.lop.pred_result_mode, instr.alu.lop.pred48, + instr.generates_cc); break; } case OpCode::Id::LOP3_C: @@ -1884,27 +2326,30 @@ private: const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39); std::string lut; - if (opcode->GetId() == OpCode::Id::LOP3_R) { + if (opcode->get().GetId() == OpCode::Id::LOP3_R) { lut = '(' + std::to_string(instr.alu.lop3.GetImmLut28()) + ')'; } else { lut = '(' + std::to_string(instr.alu.lop3.GetImmLut48()) + ')'; } - WriteLop3Instruction(instr.gpr0, op_a, op_b, op_c, lut); + WriteLop3Instruction(instr.gpr0, op_a, op_b, op_c, lut, instr.generates_cc); break; } case OpCode::Id::IMNMX_C: case OpCode::Id::IMNMX_R: case OpCode::Id::IMNMX_IMM: { - ASSERT_MSG(instr.imnmx.exchange == Tegra::Shader::IMinMaxExchange::None, - "Unimplemented"); + UNIMPLEMENTED_IF(instr.imnmx.exchange != Tegra::Shader::IMinMaxExchange::None); + UNIMPLEMENTED_IF_MSG( + instr.generates_cc, + "Condition codes generation in IMNMX is partially implemented"); + const std::string condition = GetPredicateCondition(instr.imnmx.pred, instr.imnmx.negate_pred != 0); const std::string parameters = op_a + ',' + op_b; regs.SetRegisterToInteger(instr.gpr0, instr.imnmx.is_signed, 0, '(' + condition + ") ? 
min(" + parameters + ") : max(" + parameters + ')', - 1, 1); + 1, 1, false, instr.generates_cc); break; } case OpCode::Id::LEA_R2: @@ -1914,7 +2359,7 @@ private: case OpCode::Id::LEA_HI: { std::string op_c; - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::LEA_R2: { op_a = regs.GetRegisterAsInteger(instr.gpr20); op_b = regs.GetRegisterAsInteger(instr.gpr39); @@ -1959,43 +2404,41 @@ private: op_b = regs.GetRegisterAsInteger(instr.gpr8); op_a = std::to_string(instr.lea.imm.entry_a); op_c = std::to_string(instr.lea.imm.entry_b); - LOG_CRITICAL(HW_GPU, "Unhandled LEA subinstruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled LEA subinstruction: {}", opcode->get().GetName()); } } - if (instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex)) { - LOG_ERROR(HW_GPU, "Unhandled LEA Predicate"); - UNREACHABLE(); - } + UNIMPLEMENTED_IF_MSG(instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex), + "Unhandled LEA Predicate"); const std::string value = '(' + op_a + " + (" + op_b + "*(1 << " + op_c + ")))"; - regs.SetRegisterToInteger(instr.gpr0, true, 0, value, 1, 1); + regs.SetRegisterToInteger(instr.gpr0, true, 0, value, 1, 1, false, + instr.generates_cc); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}", - opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled ArithmeticInteger instruction: {}", + opcode->get().GetName()); } } break; } case OpCode::Type::ArithmeticHalf: { - if (opcode->GetId() == OpCode::Id::HADD2_C || opcode->GetId() == OpCode::Id::HADD2_R) { - ASSERT_MSG(instr.alu_half.ftz == 0, "Unimplemented"); + if (opcode->get().GetId() == OpCode::Id::HADD2_C || + opcode->get().GetId() == OpCode::Id::HADD2_R) { + UNIMPLEMENTED_IF(instr.alu_half.ftz != 0); } const bool negate_a = - opcode->GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; + opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; const bool negate_b = - opcode->GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; + opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; const std::string op_a = GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr8, 0, false), instr.alu_half.type_a, instr.alu_half.abs_a != 0, negate_a); std::string op_b; - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HMUL2_C: op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset, @@ -2013,7 +2456,7 @@ private: op_b = GetHalfFloat(op_b, instr.alu_half.type_b, instr.alu_half.abs_b != 0, negate_b); const std::string result = [&]() { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: return '(' + op_a + " + " + op_b + ')'; @@ -2021,8 +2464,8 @@ private: case OpCode::Id::HMUL2_R: return '(' + op_a + " * " + op_b + ')'; default: - LOG_CRITICAL(HW_GPU, "Unhandled half float instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", + opcode->get().GetName()); return std::string("0"); } }(); @@ -2032,11 +2475,11 @@ private: break; } case OpCode::Type::ArithmeticHalfImmediate: { - if (opcode->GetId() == OpCode::Id::HADD2_IMM) { - ASSERT_MSG(instr.alu_half_imm.ftz == 0, "Unimplemented"); + if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { + UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0); } else { - ASSERT_MSG(instr.alu_half_imm.precision == Tegra::Shader::HalfPrecision::None, - "Unimplemented"); + 
UNIMPLEMENTED_IF(instr.alu_half_imm.precision != + Tegra::Shader::HalfPrecision::None); } const std::string op_a = GetHalfFloat( @@ -2046,7 +2489,7 @@ private: const std::string op_b = UnpackHalfImmediate(instr, true); const std::string result = [&]() { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HADD2_IMM: return op_a + " + " + op_b; case OpCode::Id::HMUL2_IMM: @@ -2066,13 +2509,16 @@ private: std::string op_b = instr.ffma.negate_b ? "-" : ""; std::string op_c = instr.ffma.negate_c ? "-" : ""; - ASSERT_MSG(instr.ffma.cc == 0, "FFMA cc not implemented"); - ASSERT_MSG(instr.ffma.tab5980_0 == 1, "FFMA tab5980_0({}) not implemented", - instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO - ASSERT_MSG(instr.ffma.tab5980_1 == 0, "FFMA tab5980_1({}) not implemented", - instr.ffma.tab5980_1.Value()); + UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented"); + UNIMPLEMENTED_IF_MSG( + instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented", + instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO + UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented", + instr.ffma.tab5980_1.Value()); + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in FFMA is partially implemented"); - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::FFMA_CR: { op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset, GLSLRegister::Type::Float); @@ -2096,25 +2542,21 @@ private: break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled FFMA instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled FFMA instruction: {}", opcode->get().GetName()); } } regs.SetRegisterToFloat(instr.gpr0, 0, "fma(" + op_a + ", " + op_b + ", " + op_c + ')', - 1, 1, instr.alu.saturate_d, 0, true); - + 1, 1, instr.alu.saturate_d, instr.generates_cc, 0, true); break; } case OpCode::Type::Hfma2: { - if (opcode->GetId() == OpCode::Id::HFMA2_RR) { - ASSERT_MSG(instr.hfma2.rr.precision == Tegra::Shader::HalfPrecision::None, - "Unimplemented"); + if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) { + UNIMPLEMENTED_IF(instr.hfma2.rr.precision != Tegra::Shader::HalfPrecision::None); } else { - ASSERT_MSG(instr.hfma2.precision == Tegra::Shader::HalfPrecision::None, - "Unimplemented"); + UNIMPLEMENTED_IF(instr.hfma2.precision != Tegra::Shader::HalfPrecision::None); } - const bool saturate = opcode->GetId() == OpCode::Id::HFMA2_RR + const bool saturate = opcode->get().GetId() == OpCode::Id::HFMA2_RR ? 
instr.hfma2.rr.saturate != 0 : instr.hfma2.saturate != 0; @@ -2122,7 +2564,7 @@ private: GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr8, 0, false), instr.hfma2.type_a); std::string op_b, op_c; - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HFMA2_CR: op_b = GetHalfFloat(regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset, GLSLRegister::Type::UnsignedInteger), @@ -2160,9 +2602,9 @@ private: break; } case OpCode::Type::Conversion: { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::I2I_R: { - ASSERT_MSG(!instr.conversion.selector, "Unimplemented"); + UNIMPLEMENTED_IF(instr.conversion.selector); std::string op_a = regs.GetRegisterAsInteger( instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size); @@ -2176,16 +2618,15 @@ private: } regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1, - 1, instr.alu.saturate_d, 0, instr.conversion.dest_size, - instr.generates_cc.Value() != 0); + 1, instr.alu.saturate_d, instr.generates_cc, 0, + instr.conversion.dest_size); break; } case OpCode::Id::I2F_R: case OpCode::Id::I2F_C: { - ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented"); - ASSERT_MSG(!instr.conversion.selector, "Unimplemented"); - - std::string op_a{}; + UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.conversion.selector); + std::string op_a; if (instr.is_b_gpr) { op_a = @@ -2207,12 +2648,12 @@ private: op_a = "-(" + op_a + ')'; } - regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1); + regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, false, instr.generates_cc); break; } case OpCode::Id::F2F_R: { - ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented"); - ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented"); + UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); std::string op_a = regs.GetRegisterAsFloat(instr.gpr20); if (instr.conversion.abs_a) { @@ -2239,18 +2680,18 @@ private: op_a = "trunc(" + op_a + ')'; break; default: - LOG_CRITICAL(HW_GPU, "Unimplemented f2f rounding mode {}", - static_cast<u32>(instr.conversion.f2f.rounding.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", + static_cast<u32>(instr.conversion.f2f.rounding.Value())); break; } - regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d); + regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d, + instr.generates_cc); break; } case OpCode::Id::F2I_R: case OpCode::Id::F2I_C: { - ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented"); + UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); std::string op_a{}; if (instr.is_b_gpr) { @@ -2281,9 +2722,8 @@ private: op_a = "trunc(" + op_a + ')'; break; default: - LOG_CRITICAL(HW_GPU, "Unimplemented f2i rounding mode {}", - static_cast<u32>(instr.conversion.f2i.rounding.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented F2I rounding mode {}", + static_cast<u32>(instr.conversion.f2i.rounding.Value())); break; } @@ -2294,24 +2734,24 @@ private: } regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1, - 1, false, 0, instr.conversion.dest_size); + 1, false, instr.generates_cc, 0, + instr.conversion.dest_size); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled conversion instruction: {}", 
opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled conversion instruction: {}", opcode->get().GetName()); } } break; } case OpCode::Type::Memory: { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::LD_A: { // Note: Shouldn't this be interp mode flat? As in no interpolation made. - ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex, - "Indirect attribute loads are not supported"); - ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0, - "Unaligned attribute loads are not supported"); + UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, + "Indirect attribute loads are not supported"); + UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0, + "Unaligned attribute loads are not supported"); Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Perspective, Tegra::Shader::IpaSampleMode::Default}; @@ -2338,12 +2778,9 @@ private: break; } case OpCode::Id::LD_C: { - ASSERT_MSG(instr.ld_c.unknown == 0, "Unimplemented"); + UNIMPLEMENTED_IF(instr.ld_c.unknown != 0); - // Add an extra scope and declare the index register inside to prevent - // overwriting it in case it is used as an output of the LD instruction. - shader.AddLine("{"); - ++shader.scope; + const auto scope = shader.Scope(); shader.AddLine("uint index = (" + regs.GetRegisterAsInteger(instr.gpr8, 0, false) + " / 4) & (MAX_CONSTBUFFER_ELEMENTS - 1);"); @@ -2366,20 +2803,16 @@ private: break; } default: - LOG_CRITICAL(HW_GPU, "Unhandled type: {}", - static_cast<unsigned>(instr.ld_c.type.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled type: {}", + static_cast<unsigned>(instr.ld_c.type.Value())); } - - --shader.scope; - shader.AddLine("}"); break; } case OpCode::Id::LD_L: { - // Add an extra scope and declare the index register inside to prevent - // overwriting it in case it is used as an output of the LD instruction. 
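// Minimal sketch of the RAII guard assumed to back the shader.Scope() call used above (the actual
// ShaderWriter member may differ): the returned object opens a brace and raises the indentation
// level, and reverses both when it goes out of scope, e.g.
//     struct ScopedBlock {
//         explicit ScopedBlock(ShaderWriter& w) : writer{w} { writer.AddLine('{'); ++writer.scope; }
//         ~ScopedBlock() { --writer.scope; writer.AddLine('}'); }
//         ShaderWriter& writer;
//     };
// which is what replaces the manual AddLine("{") / ++scope ... --scope / AddLine("}") pattern
// removed throughout these hunks.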
- shader.AddLine('{'); - ++shader.scope; + UNIMPLEMENTED_IF_MSG(instr.ld_l.unknown == 1, "LD_L Unhandled mode: {}", + static_cast<unsigned>(instr.ld_l.unknown.Value())); + + const auto scope = shader.Scope(); std::string op = '(' + regs.GetRegisterAsInteger(instr.gpr8, 0, false) + " + " + std::to_string(instr.smem_imm.Value()) + ')'; @@ -2388,31 +2821,21 @@ private: const std::string op_a = regs.GetLocalMemoryAsFloat("index"); - if (instr.ld_l.unknown != 1) { - LOG_CRITICAL(HW_GPU, "LD_L Unhandled mode: {}", - static_cast<unsigned>(instr.ld_l.unknown.Value())); - UNREACHABLE(); - } - switch (instr.ldst_sl.type.Value()) { case Tegra::Shader::StoreType::Bytes32: regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1); break; default: - LOG_CRITICAL(HW_GPU, "LD_L Unhandled type: {}", - static_cast<unsigned>(instr.ldst_sl.type.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("LD_L Unhandled type: {}", + static_cast<unsigned>(instr.ldst_sl.type.Value())); } - - --shader.scope; - shader.AddLine('}'); break; } case OpCode::Id::ST_A: { - ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex, - "Indirect attribute loads are not supported"); - ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0, - "Unaligned attribute loads are not supported"); + UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, + "Indirect attribute loads are not supported"); + UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0, + "Unaligned attribute loads are not supported"); u64 next_element = instr.attribute.fmt20.element; u64 next_index = static_cast<u64>(instr.attribute.fmt20.index.Value()); @@ -2437,472 +2860,218 @@ private: break; } case OpCode::Id::ST_L: { - // Add an extra scope and declare the index register inside to prevent - // overwriting it in case it is used as an output of the LD instruction. 
- shader.AddLine('{'); - ++shader.scope; + UNIMPLEMENTED_IF_MSG(instr.st_l.unknown == 0, "ST_L Unhandled mode: {}", + static_cast<unsigned>(instr.st_l.unknown.Value())); + + const auto scope = shader.Scope(); std::string op = '(' + regs.GetRegisterAsInteger(instr.gpr8, 0, false) + " + " + std::to_string(instr.smem_imm.Value()) + ')'; shader.AddLine("uint index = (" + op + " / 4);"); - if (instr.st_l.unknown != 0) { - LOG_CRITICAL(HW_GPU, "ST_L Unhandled mode: {}", - static_cast<unsigned>(instr.st_l.unknown.Value())); - UNREACHABLE(); - } - switch (instr.ldst_sl.type.Value()) { case Tegra::Shader::StoreType::Bytes32: regs.SetLocalMemoryAsFloat("index", regs.GetRegisterAsFloat(instr.gpr0)); break; default: - LOG_CRITICAL(HW_GPU, "ST_L Unhandled type: {}", - static_cast<unsigned>(instr.ldst_sl.type.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("ST_L Unhandled type: {}", + static_cast<unsigned>(instr.ldst_sl.type.Value())); } - - --shader.scope; - shader.AddLine('}'); break; } case OpCode::Id::TEX: { Tegra::Shader::TextureType texture_type{instr.tex.texture_type}; - std::string coord; const bool is_array = instr.tex.array != 0; - - ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), - "AOFFI is not implemented"); - const bool depth_compare = instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC); - u32 num_coordinates = TextureCoordinates(texture_type); - if (depth_compare) - num_coordinates += 1; - - switch (num_coordinates) { - case 1: { - if (is_array) { - const std::string index = regs.GetRegisterAsInteger(instr.gpr8); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + index + ");"; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - coord = "float coords = " + x + ';'; - } - break; - } - case 2: { - if (is_array) { - const std::string index = regs.GetRegisterAsInteger(instr.gpr8); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");"; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - } - break; - } - case 3: { - if (depth_compare) { - if (is_array) { - const std::string index = regs.GetRegisterAsInteger(instr.gpr8); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string y = regs.GetRegisterAsFloat(instr.gpr20); - const std::string z = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); - coord = "vec4 coords = vec4(" + x + ", " + y + ", " + z + ", " + index + - ");"; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string z = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; - } - } else { - if (is_array) { - const std::string index = regs.GetRegisterAsInteger(instr.gpr8); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); - const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 3); - coord = "vec4 coords = vec4(" + x + ", " + y + ", " + z + ", " + index + - 
");"; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; - } - } - break; - } - default: - LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}", - static_cast<u32>(num_coordinates)); - UNREACHABLE(); + const auto process_mode = instr.tex.GetTextureProcessMode(); + UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); - // Fallback to interpreting as a 2D texture for now - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - texture_type = Tegra::Shader::TextureType::Texture2D; - } - // TODO: make sure coordinates are always indexed to gpr8 and gpr20 is always bias - // or lod. - std::string op_c; - - const std::string sampler = - GetSampler(instr.sampler, texture_type, is_array, depth_compare); - // Add an extra scope and declare the texture coords inside to prevent - // overwriting them in case they are used as outputs of the texs instruction. + const auto [coord, texture] = + GetTEXCode(instr, texture_type, process_mode, depth_compare, is_array); - shader.AddLine("{"); - ++shader.scope; + const auto scope = shader.Scope(); shader.AddLine(coord); - std::string texture; - switch (instr.tex.GetTextureProcessMode()) { - case Tegra::Shader::TextureProcessMode::None: { - texture = "texture(" + sampler + ", coords)"; - break; - } - case Tegra::Shader::TextureProcessMode::LZ: { - texture = "textureLod(" + sampler + ", coords, 0.0)"; - break; - } - case Tegra::Shader::TextureProcessMode::LB: - case Tegra::Shader::TextureProcessMode::LBA: { - if (depth_compare) { - if (is_array) - op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 2); - else - op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); - } else { - op_c = regs.GetRegisterAsFloat(instr.gpr20); - } - // TODO: Figure if A suffix changes the equation at all. - texture = "texture(" + sampler + ", coords, " + op_c + ')'; - break; - } - case Tegra::Shader::TextureProcessMode::LL: - case Tegra::Shader::TextureProcessMode::LLA: { - if (num_coordinates <= 2) { - op_c = regs.GetRegisterAsFloat(instr.gpr20); - } else { - op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); - } - // TODO: Figure if A suffix changes the equation at all. 
- texture = "textureLod(" + sampler + ", coords, " + op_c + ')'; - break; - } - default: { - texture = "texture(" + sampler + ", coords)"; - LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}", - static_cast<u32>(instr.tex.GetTextureProcessMode())); - UNREACHABLE(); - } - } - if (!depth_compare) { + if (depth_compare) { + regs.SetRegisterToFloat(instr.gpr0, 0, texture, 1, 1); + } else { + shader.AddLine("vec4 texture_tmp = " + texture + ';'); std::size_t dest_elem{}; for (std::size_t elem = 0; elem < 4; ++elem) { if (!instr.tex.IsComponentEnabled(elem)) { // Skip disabled components continue; } - regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem); + regs.SetRegisterToFloat(instr.gpr0, elem, "texture_tmp", 1, 4, false, false, + dest_elem); ++dest_elem; } - } else { - regs.SetRegisterToFloat(instr.gpr0, 0, texture, 1, 1, false); } - --shader.scope; - shader.AddLine("}"); break; } case OpCode::Id::TEXS: { - std::string coord; Tegra::Shader::TextureType texture_type{instr.texs.GetTextureType()}; - bool is_array{instr.texs.IsArrayTexture()}; - - ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - + const bool is_array{instr.texs.IsArrayTexture()}; const bool depth_compare = instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC); - u32 num_coordinates = TextureCoordinates(texture_type); - if (depth_compare) - num_coordinates += 1; - - switch (num_coordinates) { - case 2: { - if (is_array) { - const std::string index = regs.GetRegisterAsInteger(instr.gpr8); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string y = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");"; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - } - break; - } - case 3: { - if (is_array) { - UNIMPLEMENTED_MSG("3-coordinate arrays not fully implemented"); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - texture_type = Tegra::Shader::TextureType::Texture2D; - is_array = false; - } else { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string z = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; - } - break; - } - default: - LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}", - static_cast<u32>(num_coordinates)); - UNREACHABLE(); + const auto process_mode = instr.texs.GetTextureProcessMode(); - // Fallback to interpreting as a 2D texture for now - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr20); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - texture_type = Tegra::Shader::TextureType::Texture2D; - is_array = false; - } - const std::string sampler = - GetSampler(instr.sampler, texture_type, is_array, depth_compare); - std::string texture; - switch (instr.texs.GetTextureProcessMode()) { - case Tegra::Shader::TextureProcessMode::None: { - texture = "texture(" + sampler + ", coords)"; - break; - } - case Tegra::Shader::TextureProcessMode::LZ: { - texture = "textureLod(" + sampler + ", coords, 0.0)"; - break; - } - case 
Tegra::Shader::TextureProcessMode::LL: { - const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1); - texture = "textureLod(" + sampler + ", coords, " + op_c + ')'; - break; - } - default: { - texture = "texture(" + sampler + ", coords)"; - LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}", - static_cast<u32>(instr.texs.GetTextureProcessMode())); - UNREACHABLE(); - } + UNIMPLEMENTED_IF_MSG(instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + + const auto scope = shader.Scope(); + + auto [coord, texture] = + GetTEXSCode(instr, texture_type, process_mode, depth_compare, is_array); + + shader.AddLine(coord); + + if (depth_compare) { + texture = "vec4(" + texture + ')'; } - if (!depth_compare) { - WriteTexsInstruction(instr, coord, texture); + shader.AddLine("vec4 texture_tmp = " + texture + ';'); + + if (instr.texs.fp32_flag) { + WriteTexsInstructionFloat(instr, "texture_tmp"); } else { - WriteTexsInstruction(instr, coord, "vec4(" + texture + ')'); + WriteTexsInstructionHalfFloat(instr, "texture_tmp"); } break; } case OpCode::Id::TLDS: { - std::string coord; const Tegra::Shader::TextureType texture_type{instr.tlds.GetTextureType()}; const bool is_array{instr.tlds.IsArrayTexture()}; - ASSERT(texture_type == Tegra::Shader::TextureType::Texture2D); - ASSERT(is_array == false); + UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::MZ), + "MZ is not implemented"); - ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), - "AOFFI is not implemented"); - ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::MZ), - "MZ is not implemented"); + const auto [coord, texture] = GetTLDSCode(instr, texture_type, is_array); - u32 op_c_offset = 0; + const auto scope = shader.Scope(); - switch (texture_type) { - case Tegra::Shader::TextureType::Texture1D: { - const std::string x = regs.GetRegisterAsInteger(instr.gpr8); - coord = "int coords = " + x + ';'; - break; - } - case Tegra::Shader::TextureType::Texture2D: { - if (is_array) { - LOG_CRITICAL(HW_GPU, "Unhandled 2d array texture"); - UNREACHABLE(); - } else { - const std::string x = regs.GetRegisterAsInteger(instr.gpr8); - const std::string y = regs.GetRegisterAsInteger(instr.gpr20); - coord = "ivec2 coords = ivec2(" + x + ", " + y + ");"; - op_c_offset = 1; - } - break; - } - default: - LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", - static_cast<u32>(texture_type)); - UNREACHABLE(); - } - const std::string sampler = - GetSampler(instr.sampler, texture_type, is_array, false); - std::string texture = "texelFetch(" + sampler + ", coords, 0)"; - switch (instr.tlds.GetTextureProcessMode()) { - case Tegra::Shader::TextureProcessMode::LZ: { - texture = "texelFetch(" + sampler + ", coords, 0)"; - break; - } - case Tegra::Shader::TextureProcessMode::LL: { - const std::string op_c = - regs.GetRegisterAsInteger(instr.gpr20.Value() + op_c_offset); - texture = "texelFetch(" + sampler + ", coords, " + op_c + ')'; - break; - } - default: { - texture = "texelFetch(" + sampler + ", coords, 0)"; - LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}", - 
static_cast<u32>(instr.tlds.GetTextureProcessMode())); - UNREACHABLE(); - } - } - WriteTexsInstruction(instr, coord, texture); + shader.AddLine(coord); + shader.AddLine("vec4 texture_tmp = " + texture + ';'); + WriteTexsInstructionFloat(instr, "texture_tmp"); break; } case OpCode::Id::TLD4: { - ASSERT(instr.tld4.texture_type == Tegra::Shader::TextureType::Texture2D); - ASSERT(instr.tld4.array == 0); - std::string coord; - - ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), - "AOFFI is not implemented"); - ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), - "NDV is not implemented"); - ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::PTP), - "PTP is not implemented"); + + UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), + "NDV is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::PTP), + "PTP is not implemented"); + + auto texture_type = instr.tld4.texture_type.Value(); const bool depth_compare = instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC); - auto texture_type = instr.tld4.texture_type.Value(); - u32 num_coordinates = TextureCoordinates(texture_type); - if (depth_compare) - num_coordinates += 1; + const bool is_array = instr.tld4.array != 0; - switch (num_coordinates) { - case 2: { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - break; - } - case 3: { - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2); - coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");"; - break; - } - default: - LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}", - static_cast<u32>(num_coordinates)); - UNREACHABLE(); - const std::string x = regs.GetRegisterAsFloat(instr.gpr8); - const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; - texture_type = Tegra::Shader::TextureType::Texture2D; - } + const auto [coord, texture] = + GetTLD4Code(instr, texture_type, depth_compare, is_array); + + const auto scope = shader.Scope(); - const std::string sampler = - GetSampler(instr.sampler, texture_type, false, depth_compare); - // Add an extra scope and declare the texture coords inside to prevent - // overwriting them in case they are used as outputs of the texs instruction. 
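// Background on the call emitted for TLD4 (standard GLSL semantics, not something introduced by
// this change): textureGather(sampler, coord, comp) returns a vec4 holding the selected component
// of the four texels that bilinear filtering would use, e.g. component 1 gathers the .g channel;
// the shadow form textureGather(sampler, coord, ref) returns the four comparison results instead,
// which is why the depth-compare path above passes gpr20 as the reference value.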
- shader.AddLine("{"); - ++shader.scope; shader.AddLine(coord); - const std::string texture = "textureGather(" + sampler + ", coords, " + - std::to_string(instr.tld4.component) + ')'; - if (!depth_compare) { - std::size_t dest_elem{}; - for (std::size_t elem = 0; elem < 4; ++elem) { - if (!instr.tex.IsComponentEnabled(elem)) { - // Skip disabled components - continue; - } - regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem); - ++dest_elem; + std::size_t dest_elem{}; + + shader.AddLine("vec4 texture_tmp = " + texture + ';'); + for (std::size_t elem = 0; elem < 4; ++elem) { + if (!instr.tex.IsComponentEnabled(elem)) { + // Skip disabled components + continue; } - } else { - regs.SetRegisterToFloat(instr.gpr0, 0, texture, 1, 1, false); + regs.SetRegisterToFloat(instr.gpr0, elem, "texture_tmp", 1, 4, false, false, + dest_elem); + ++dest_elem; } - --shader.scope; - shader.AddLine("}"); break; } case OpCode::Id::TLD4S: { - ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), - "AOFFI is not implemented"); + UNIMPLEMENTED_IF_MSG( + instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG( + instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + + const auto scope = shader.Scope(); + + std::string coords; const bool depth_compare = instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC); - const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); - const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20); - // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction. + const std::string sampler = GetSampler( instr.sampler, Tegra::Shader::TextureType::Texture2D, false, depth_compare); - std::string coord; - if (!depth_compare) { - coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");"; - } else { - // Note: TLD4S coordinate encoding works just like TEXS's - const std::string op_c = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec3 coords = vec3(" + op_a + ", " + op_c + ", " + op_b + ");"; - } - const std::string texture = "textureGather(" + sampler + ", coords, " + - std::to_string(instr.tld4s.component) + ')'; + + const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); + coords = "vec2 coords = vec2(" + op_a + ", "; + std::string texture = "textureGather(" + sampler + ", coords, "; if (!depth_compare) { - WriteTexsInstruction(instr, coord, texture); + const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20); + coords += op_b + ");"; + texture += std::to_string(instr.tld4s.component) + ')'; } else { - WriteTexsInstruction(instr, coord, "vec4(" + texture + ')'); + const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20); + coords += op_b + ");"; + texture += op_c + ')'; } + shader.AddLine(coords); + shader.AddLine("vec4 texture_tmp = " + texture + ';'); + WriteTexsInstructionFloat(instr, "texture_tmp"); break; } case OpCode::Id::TXQ: { - ASSERT_MSG(!instr.txq.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.txq.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + + const auto scope = shader.Scope(); - // TODO: the new commits on the texture refactor, change the way samplers work. 
+ // TODO: The new commits on the texture refactor, change the way samplers work. // Sadly, not all texture instructions specify the type of texture their sampler // uses. This must be fixed at a later instance. const std::string sampler = GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false); switch (instr.txq.query_type) { case Tegra::Shader::TextureQueryType::Dimension: { - const std::string texture = "textureQueryLevels(" + sampler + ')'; - regs.SetRegisterToInteger(instr.gpr0, true, 0, texture, 1, 1); + const std::string texture = "textureSize(" + sampler + ", " + + regs.GetRegisterAsInteger(instr.gpr8) + ')'; + const std::string mip_level = "textureQueryLevels(" + sampler + ')'; + shader.AddLine("ivec2 sizes = " + texture + ';'); + + regs.SetRegisterToInteger(instr.gpr0.Value() + 0, true, 0, "sizes.x", 1, 1); + regs.SetRegisterToInteger(instr.gpr0.Value() + 1, true, 0, "sizes.y", 1, 1); + regs.SetRegisterToInteger(instr.gpr0.Value() + 2, true, 0, "0", 1, 1); + regs.SetRegisterToInteger(instr.gpr0.Value() + 3, true, 0, mip_level, 1, 1); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled texture query type: {}", - static_cast<u32>(instr.txq.query_type.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled texture query type: {}", + static_cast<u32>(instr.txq.query_type.Value())); } } break; } case OpCode::Id::TMML: { - ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), - "NODEP is not implemented"); - ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), - "NDV is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), + "NDV is not implemented"); const std::string x = regs.GetRegisterAsFloat(instr.gpr8); const bool is_array = instr.tmml.array != 0; @@ -2910,47 +3079,38 @@ private: const std::string sampler = GetSampler(instr.sampler, texture_type, is_array, false); - // TODO: add coordinates for different samplers once other texture types are + const auto scope = shader.Scope(); + + // TODO: Add coordinates for different samplers once other texture types are // implemented. - std::string coord; switch (texture_type) { case Tegra::Shader::TextureType::Texture1D: { - coord = "float coords = " + x + ';'; + shader.AddLine("float coords = " + x + ';'); break; } case Tegra::Shader::TextureType::Texture2D: { const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + shader.AddLine("vec2 coords = vec2(" + x + ", " + y + ");"); break; } default: - LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", - static_cast<u32>(texture_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<u32>(texture_type)); // Fallback to interpreting as a 2D texture for now const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + shader.AddLine("vec2 coords = vec2(" + x + ", " + y + ");"); texture_type = Tegra::Shader::TextureType::Texture2D; } - // Add an extra scope and declare the texture coords inside to prevent - // overwriting them in case they are used as outputs of the texs instruction. 
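// Note on the query emitted below (standard GLSL behaviour, plus an assumption about the guest
// format): textureQueryLod(sampler, coords) returns a vec2 whose .x component is the mipmap level
// that would be accessed and whose .y component is the computed LOD relative to the base level;
// the multiplication by 256.0 presumably converts both values to a fixed-point encoding with
// 8 fractional bits, e.g. an LOD of 1.5 becomes 384.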
- shader.AddLine('{'); - ++shader.scope; - shader.AddLine(coord); + const std::string texture = "textureQueryLod(" + sampler + ", coords)"; - const std::string tmp = "vec2 tmp = " + texture + "*vec2(256.0, 256.0);"; - shader.AddLine(tmp); + shader.AddLine("vec2 tmp = " + texture + " * vec2(256.0, 256.0);"); regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(tmp.y)", 1, 1); regs.SetRegisterToInteger(instr.gpr0.Value() + 1, false, 0, "uint(tmp.x)", 1, 1); - --shader.scope; - shader.AddLine('}'); break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled memory instruction: {}", opcode->get().GetName()); } } break; @@ -3036,14 +3196,14 @@ private: break; } case OpCode::Type::HalfSetPredicate: { - ASSERT_MSG(instr.hsetp2.ftz == 0, "Unimplemented"); + UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); const std::string op_a = GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr8, 0, false), instr.hsetp2.type_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); const std::string op_b = [&]() { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HSETP2_R: return GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr20, 0, false), instr.hsetp2.type_b, instr.hsetp2.abs_a, @@ -3081,6 +3241,9 @@ private: break; } case OpCode::Type::PredicateSetRegister: { + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in PSET is partially implemented"); + const std::string op_a = GetPredicateCondition(instr.pset.pred12, instr.pset.neg_pred12 != 0); const std::string op_b = @@ -3096,16 +3259,16 @@ private: const std::string result = '(' + predicate + ") " + combiner + " (" + second_pred + ')'; if (instr.pset.bf == 0) { const std::string value = '(' + result + ") ? 0xFFFFFFFF : 0"; - regs.SetRegisterToInteger(instr.gpr0, false, 0, value, 1, 1); + regs.SetRegisterToInteger(instr.gpr0, false, 0, value, 1, 1, false, + instr.generates_cc); } else { const std::string value = '(' + result + ") ? 
1.0 : 0.0"; - regs.SetRegisterToFloat(instr.gpr0, 0, value, 1, 1); + regs.SetRegisterToFloat(instr.gpr0, 0, value, 1, 1, false, instr.generates_cc); } - break; } case OpCode::Type::PredicateSetPredicate: { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::PSETP: { const std::string op_a = GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0); @@ -3139,24 +3302,52 @@ private: const std::string pred = GetPredicateCondition(instr.csetp.pred39, instr.csetp.neg_pred39 != 0); const std::string combiner = GetPredicateCombiner(instr.csetp.op); - const std::string control_code = regs.GetControlCode(instr.csetp.cc); + const std::string condition_code = regs.GetConditionCode(instr.csetp.cc); if (instr.csetp.pred3 != static_cast<u64>(Pred::UnusedIndex)) { SetPredicate(instr.csetp.pred3, - '(' + control_code + ") " + combiner + " (" + pred + ')'); + '(' + condition_code + ") " + combiner + " (" + pred + ')'); } if (instr.csetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) { SetPredicate(instr.csetp.pred0, - "!(" + control_code + ") " + combiner + " (" + pred + ')'); + "!(" + condition_code + ") " + combiner + " (" + pred + ')'); } break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled predicate instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled predicate instruction: {}", opcode->get().GetName()); } } break; } + case OpCode::Type::RegisterSetPredicate: { + UNIMPLEMENTED_IF(instr.r2p.mode != Tegra::Shader::R2pMode::Pr); + + const std::string apply_mask = [&]() { + switch (opcode->get().GetId()) { + case OpCode::Id::R2P_IMM: + return std::to_string(instr.r2p.immediate_mask); + default: + UNREACHABLE(); + return std::to_string(instr.r2p.immediate_mask); + } + }(); + const std::string mask = '(' + regs.GetRegisterAsInteger(instr.gpr8, 0, false) + + " >> " + std::to_string(instr.r2p.byte) + ')'; + + constexpr u64 programmable_preds = 7; + for (u64 pred = 0; pred < programmable_preds; ++pred) { + const auto shift = std::to_string(1 << pred); + + shader.AddLine("if ((" + apply_mask + " & " + shift + ") != 0) {"); + ++shader.scope; + + SetPredicate(pred, '(' + mask + " & " + shift + ") != 0"); + + --shader.scope; + shader.AddLine('}'); + } + break; + } case OpCode::Type::FloatSet: { const std::string op_a = GetOperandAbsNeg(regs.GetRegisterAsFloat(instr.gpr8), instr.fset.abs_a != 0, instr.fset.neg_a != 0); @@ -3189,10 +3380,11 @@ private: ") " + combiner + " (" + second_pred + "))"; if (instr.fset.bf) { - regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1); + regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1, false, + instr.generates_cc); } else { regs.SetRegisterToInteger(instr.gpr0, false, 0, predicate + " ? 
0xFFFFFFFF : 0", 1, - 1); + 1, false, instr.generates_cc); } break; } @@ -3232,14 +3424,14 @@ private: break; } case OpCode::Type::HalfSet: { - ASSERT_MSG(instr.hset2.ftz == 0, "Unimplemented"); + UNIMPLEMENTED_IF(instr.hset2.ftz != 0); const std::string op_a = GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr8, 0, false), instr.hset2.type_a, instr.hset2.abs_a != 0, instr.hset2.negate_a != 0); const std::string op_b = [&]() { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::HSET2_R: return GetHalfFloat(regs.GetRegisterAsInteger(instr.gpr20, 0, false), instr.hset2.type_b, instr.hset2.abs_b != 0, @@ -3276,19 +3468,21 @@ private: break; } case OpCode::Type::Xmad: { - ASSERT_MSG(!instr.xmad.sign_a, "Unimplemented"); - ASSERT_MSG(!instr.xmad.sign_b, "Unimplemented"); + UNIMPLEMENTED_IF(instr.xmad.sign_a); + UNIMPLEMENTED_IF(instr.xmad.sign_b); + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in XMAD is partially implemented"); std::string op_a{regs.GetRegisterAsInteger(instr.gpr8, 0, instr.xmad.sign_a)}; std::string op_b; std::string op_c; // TODO(bunnei): Needs to be fixed once op_a or op_b is signed - ASSERT_MSG(instr.xmad.sign_a == instr.xmad.sign_b, "Unimplemented"); + UNIMPLEMENTED_IF(instr.xmad.sign_a != instr.xmad.sign_b); const bool is_signed{instr.xmad.sign_a == 1}; bool is_merge{}; - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::XMAD_CR: { is_merge = instr.xmad.merge_56; op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset, @@ -3317,8 +3511,7 @@ private: break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled XMAD instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); } } @@ -3354,9 +3547,8 @@ private: op_c = "((" + op_c + ") + (" + src2 + "<< 16))"; break; default: { - LOG_CRITICAL(HW_GPU, "Unhandled XMAD mode: {}", - static_cast<u32>(instr.xmad.mode.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled XMAD mode: {}", + static_cast<u32>(instr.xmad.mode.Value())); } } @@ -3365,12 +3557,17 @@ private: sum = "((" + sum + " & 0xFFFF) | (" + src2 + "<< 16))"; } - regs.SetRegisterToInteger(instr.gpr0, is_signed, 0, sum, 1, 1); + regs.SetRegisterToInteger(instr.gpr0, is_signed, 0, sum, 1, 1, false, + instr.generates_cc); break; } default: { - switch (opcode->GetId()) { + switch (opcode->get().GetId()) { case OpCode::Id::EXIT: { + const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, + "EXIT condition code used: {}", static_cast<u32>(cc)); + if (stage == Maxwell3D::Regs::ShaderStage::Fragment) { EmitFragmentOutputsWrite(); } @@ -3389,18 +3586,21 @@ private: case Tegra::Shader::FlowCondition::Fcsm_Tr: // TODO(bunnei): What is this used for? 
If we assume this condition is not // satisfied, dual vertex shaders in Farming Simulator make more sense - LOG_CRITICAL(HW_GPU, "Skipping unknown FlowCondition::Fcsm_Tr"); + UNIMPLEMENTED_MSG("Skipping unknown FlowCondition::Fcsm_Tr"); break; default: - LOG_CRITICAL(HW_GPU, "Unhandled flow condition: {}", - static_cast<u32>(instr.flow.cond.Value())); + UNIMPLEMENTED_MSG("Unhandled flow condition: {}", + static_cast<u32>(instr.flow.cond.Value())); } break; } case OpCode::Id::KIL: { - ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always); + UNIMPLEMENTED_IF(instr.flow.cond != Tegra::Shader::FlowCondition::Always); + + const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, + "KIL condition code used: {}", static_cast<u32>(cc)); // Enclose "discard" in a conditional, so that GLSL compilation does not complain // about unexecuted instructions that may follow this. @@ -3413,7 +3613,8 @@ private: break; } case OpCode::Id::OUT_R: { - ASSERT(instr.gpr20.Value() == Register::ZeroIndex); + UNIMPLEMENTED_IF_MSG(instr.gpr20.Value() != Register::ZeroIndex, + "Stream buffer is not supported"); ASSERT_MSG(stage == Maxwell3D::Regs::ShaderStage::Geometry, "OUT is expected to be used in a geometry shader."); @@ -3439,19 +3640,23 @@ private: regs.SetRegisterToInteger(instr.gpr0, false, 0, "0u", 1, 1); break; } + case Tegra::Shader::SystemVariable::Ydirection: { + // Config pack's third value is Y_NEGATE's state. + regs.SetRegisterToFloat(instr.gpr0, 0, "uintBitsToFloat(config_pack[2])", 1, 1); + break; + } default: { - LOG_CRITICAL(HW_GPU, "Unhandled system move: {}", - static_cast<u32>(instr.sys20.Value())); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled system move: {}", + static_cast<u32>(instr.sys20.Value())); } } break; } case OpCode::Id::ISBERD: { - ASSERT(instr.isberd.o == 0); - ASSERT(instr.isberd.skew == 0); - ASSERT(instr.isberd.shift == Tegra::Shader::IsberdShift::None); - ASSERT(instr.isberd.mode == Tegra::Shader::IsberdMode::None); + UNIMPLEMENTED_IF(instr.isberd.o != 0); + UNIMPLEMENTED_IF(instr.isberd.skew != 0); + UNIMPLEMENTED_IF(instr.isberd.shift != Tegra::Shader::IsberdShift::None); + UNIMPLEMENTED_IF(instr.isberd.mode != Tegra::Shader::IsberdMode::None); ASSERT_MSG(stage == Maxwell3D::Regs::ShaderStage::Geometry, "ISBERD is expected to be used in a geometry shader."); LOG_WARNING(HW_GPU, "ISBERD instruction is incomplete"); @@ -3459,10 +3664,21 @@ private: break; } case OpCode::Id::BRA: { - ASSERT_MSG(instr.bra.constant_buffer == 0, - "BRA with constant buffers are not implemented"); + UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, + "BRA with constant buffers are not implemented"); + + const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; const u32 target = offset + instr.bra.GetBranchTarget(); - shader.AddLine("{ jmp_to = " + std::to_string(target) + "u; break; }"); + if (cc != Tegra::Shader::ConditionCode::T) { + const std::string condition_code = regs.GetConditionCode(cc); + shader.AddLine("if (" + condition_code + "){"); + shader.scope++; + shader.AddLine("{ jmp_to = " + std::to_string(target) + "u; break; }"); + shader.scope--; + shader.AddLine('}'); + } else { + shader.AddLine("{ jmp_to = " + std::to_string(target) + "u; break; }"); + } break; } case OpCode::Id::IPA: { @@ -3483,7 +3699,8 @@ private: // The SSY opcode tells the GPU where to re-converge divergent execution paths, it // sets the target of the jump that the SYNC instruction will make.
The SSY opcode // has a similar structure to the BRA opcode. - ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer flow is not supported"); + UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, + "Constant buffer flow is not supported"); const u32 target = offset + instr.bra.GetBranchTarget(); EmitPushToFlowStack(target); @@ -3493,21 +3710,28 @@ private: // PBK pushes to a stack the address where BRK will jump to. This shares stack with // SSY but using SYNC on a PBK address will kill the shader execution. We don't // emulate this because it's very unlikely a driver will emit such invalid shader. - ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer PBK is not supported"); + UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, + "Constant buffer PBK is not supported"); const u32 target = offset + instr.bra.GetBranchTarget(); EmitPushToFlowStack(target); break; } case OpCode::Id::SYNC: { + const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, + "SYNC condition code used: {}", static_cast<u32>(cc)); + // The SYNC opcode jumps to the address previously set by the SSY opcode - ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always); EmitPopFromFlowStack(); break; } case OpCode::Id::BRK: { // The BRK opcode jumps to the address previously set by the PBK opcode - ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always); + const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, + "BRK condition code used: {}", static_cast<u32>(cc)); + EmitPopFromFlowStack(); break; } @@ -3518,6 +3742,9 @@ private: break; } case OpCode::Id::VMAD: { + UNIMPLEMENTED_IF_MSG(instr.generates_cc, + "Condition codes generation in VMAD is not implemented"); + const bool result_signed = instr.video.signed_a == 1 || instr.video.signed_b == 1; const std::string op_a = GetVideoOperandA(instr); const std::string op_b = GetVideoOperandB(instr); @@ -3535,8 +3762,7 @@ private: } regs.SetRegisterToInteger(instr.gpr0, result_signed, 1, result, 1, 1, - instr.vmad.saturate == 1, 0, Register::Size::Word, - instr.vmad.cc); + instr.vmad.saturate, instr.vmad.cc); break; } case OpCode::Id::VSETP: { @@ -3564,8 +3790,8 @@ private: break; } default: { - LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName()); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unhandled instruction: {}", opcode->get().GetName()); + break; } } @@ -3691,6 +3917,7 @@ private: Maxwell3D::Regs::ShaderStage stage; const std::string& suffix; u64 local_memory_size; + std::size_t shader_length; ShaderWriter shader; ShaderWriter declarations; @@ -3705,18 +3932,19 @@ std::string GetCommonDeclarations() { RasterizerOpenGL::MaxConstbufferSize / sizeof(GLvec4)); } -boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset, - Maxwell3D::Regs::ShaderStage stage, - const std::string& suffix) { +std::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset, + Maxwell3D::Regs::ShaderStage stage, + const std::string& suffix) { try { - const auto subroutines = - ControlFlowAnalyzer(program_code, main_offset, suffix).GetSubroutines(); - GLSLGenerator generator(subroutines, program_code, main_offset, stage, suffix); + ControlFlowAnalyzer analyzer(program_code, main_offset, suffix); + const auto subroutines = analyzer.GetSubroutines(); + GLSLGenerator generator(subroutines, program_code, main_offset, stage, suffix, + 
analyzer.GetShaderLength()); return ProgramResult{generator.GetShaderCode(), generator.GetEntries()}; } catch (const DecompileFail& exception) { LOG_ERROR(HW_GPU, "Shader decompilation failed: {}", exception.what()); } - return boost::none; + return {}; } } // namespace OpenGL::GLShader::Decompiler diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index b20cc4bfa..d01a4a7ee 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -6,8 +6,8 @@ #include <array> #include <functional> +#include <optional> #include <string> -#include <boost/optional.hpp> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -18,8 +18,8 @@ using Tegra::Engines::Maxwell3D; std::string GetCommonDeclarations(); -boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset, - Maxwell3D::Regs::ShaderStage stage, - const std::string& suffix); +std::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset, + Maxwell3D::Regs::ShaderStage stage, + const std::string& suffix); } // namespace OpenGL::GLShader::Decompiler diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index dfb562706..5d0819dc5 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <fmt/format.h> #include "common/assert.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" @@ -16,6 +17,8 @@ static constexpr u32 PROGRAM_OFFSET{10}; ProgramResult GenerateVertexShader(const ShaderSetup& setup) { std::string out = "#version 430 core\n"; out += "#extension GL_ARB_separate_shader_objects : enable\n\n"; + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + out += "// Shader Unique Id: VS" + id + "\n\n"; out += Decompiler::GetCommonDeclarations(); out += R"( @@ -24,8 +27,7 @@ layout (location = 0) out vec4 position; layout(std140) uniform vs_config { vec4 viewport_flip; - uvec4 instance_id; - uvec4 flip_stage; + uvec4 config_pack; // instance_id, flip_stage, y_direction, padding uvec4 alpha_test; }; )"; @@ -37,7 +39,7 @@ layout(std140) uniform vs_config { ProgramResult program = Decompiler::DecompileProgram(setup.program.code, PROGRAM_OFFSET, Maxwell3D::Regs::ShaderStage::Vertex, "vertex") - .get_value_or({}); + .value_or(ProgramResult()); out += program.first; @@ -45,7 +47,7 @@ layout(std140) uniform vs_config { ProgramResult program_b = Decompiler::DecompileProgram(setup.program.code_b, PROGRAM_OFFSET, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b") - .get_value_or({}); + .value_or(ProgramResult()); out += program_b.first; } @@ -63,7 +65,8 @@ void main() { out += R"( // Check if the flip stage is VertexB - if (flip_stage[0] == 1) { + // Config pack's second value is flip_stage + if (config_pack[1] == 1) { // Viewport can be flipped, which is unsupported by glViewport position.xy *= viewport_flip.xy; } @@ -71,7 +74,7 @@ void main() { // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0 // For now, this is here to bring order in lieu of proper emulation - if (flip_stage[0] == 1) { + if (config_pack[1] == 1) { position.w = 1.0; } 
} @@ -82,15 +85,17 @@ void main() { } ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { - std::string out = "#version 430 core\n"; - out += "#extension GL_ARB_separate_shader_objects : enable\n\n"; + // Version is intentionally skipped in shader generation, it's added by the lazy compilation. + std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + out += "// Shader Unique Id: GS" + id + "\n\n"; out += Decompiler::GetCommonDeclarations(); out += "bool exec_geometry();\n"; ProgramResult program = Decompiler::DecompileProgram(setup.program.code, PROGRAM_OFFSET, Maxwell3D::Regs::ShaderStage::Geometry, "geometry") - .get_value_or({}); + .value_or(ProgramResult()); out += R"( out gl_PerVertex { vec4 gl_Position; @@ -101,8 +106,7 @@ layout (location = 0) out vec4 position; layout (std140) uniform gs_config { vec4 viewport_flip; - uvec4 instance_id; - uvec4 flip_stage; + uvec4 config_pack; // instance_id, flip_stage, y_direction, padding uvec4 alpha_test; }; @@ -118,13 +122,15 @@ void main() { ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { std::string out = "#version 430 core\n"; out += "#extension GL_ARB_separate_shader_objects : enable\n\n"; + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + out += "// Shader Unique Id: FS" + id + "\n\n"; out += Decompiler::GetCommonDeclarations(); out += "bool exec_fragment();\n"; ProgramResult program = Decompiler::DecompileProgram(setup.program.code, PROGRAM_OFFSET, Maxwell3D::Regs::ShaderStage::Fragment, "fragment") - .get_value_or({}); + .value_or(ProgramResult()); out += R"( layout(location = 0) out vec4 FragColor0; layout(location = 1) out vec4 FragColor1; @@ -139,8 +145,7 @@ layout (location = 0) in vec4 position; layout (std140) uniform fs_config { vec4 viewport_flip; - uvec4 instance_id; - uvec4 flip_stage; + uvec4 config_pack; // instance_id, flip_stage, y_direction, padding uvec4 alpha_test; }; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index 520b9d4e3..fcc20d3b4 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -163,6 +163,8 @@ private: struct ShaderEntries { std::vector<ConstBufferEntry> const_buffer_entries; std::vector<SamplerEntry> texture_samplers; + std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> clip_distances; + std::size_t shader_length; }; using ProgramResult = std::pair<std::string, ShaderEntries>; @@ -175,6 +177,9 @@ struct ShaderSetup { struct { ProgramCode code; ProgramCode code_b; // Used for dual vertex shaders + u64 unique_identifier; + std::size_t real_size; + std::size_t real_size_b; } program; /// Used in scenarios where we have a dual vertex shaders diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 8b8869ecb..6a30c28d2 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -27,16 +27,18 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh alpha_test.func = func; alpha_test.ref = regs.alpha_test_ref; - // We only assign the instance to the first component of the vector, the rest is just padding. 
- instance_id[0] = state.current_instance; + instance_id = state.current_instance; // Assign in which stage the position has to be flipped // (the last stage before the fragment shader). if (gpu.regs.shader_config[static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry)].enable) { - flip_stage[0] = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry); + flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry); } else { - flip_stage[0] = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB); + flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB); } + + // Y_NEGATE controls what value S2R returns for the Y_DIRECTION system value. + y_direction = regs.screen_y_control.y_negate == 0 ? 1.f : -1.f; } } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 36fe1f04c..4970aafed 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -7,6 +7,7 @@ #include <glad/glad.h> #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" namespace OpenGL::GLShader { @@ -20,8 +21,11 @@ using Tegra::Engines::Maxwell3D; struct MaxwellUniformData { void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage); alignas(16) GLvec4 viewport_flip; - alignas(16) GLuvec4 instance_id; - alignas(16) GLuvec4 flip_stage; + struct alignas(16) { + GLuint instance_id; + GLuint flip_stage; + GLfloat y_direction; + }; struct alignas(16) { GLuint enabled; GLuint func; @@ -29,7 +33,7 @@ struct MaxwellUniformData { GLuint padding; } alpha_test; }; -static_assert(sizeof(MaxwellUniformData) == 64, "MaxwellUniformData structure size is incorrect"); +static_assert(sizeof(MaxwellUniformData) == 48, "MaxwellUniformData structure size is incorrect"); static_assert(sizeof(MaxwellUniformData) < 16384, "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec"); @@ -56,6 +60,17 @@ public: } void ApplyTo(OpenGLState& state) { + UpdatePipeline(); + state.draw.shader_program = 0; + state.draw.program_pipeline = pipeline.handle; + state.geometry_shaders.enabled = (gs != 0); + } + +private: + void UpdatePipeline() { + // Avoid updating the pipeline when values have not changed + if (old_vs == vs && old_fs == fs && old_gs == gs) + return; // Workaround for AMD bug glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | GL_FRAGMENT_SHADER_BIT, @@ -64,13 +79,16 @@ public: glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vs); glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, gs); glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fs); - state.draw.shader_program = 0; - state.draw.program_pipeline = pipeline.handle; + + // Update the old values + old_vs = vs; + old_fs = fs; + old_gs = gs; } -private: OGLPipeline pipeline; GLuint vs{}, fs{}, gs{}; + GLuint old_vs{}, old_fs{}, old_gs{}; }; } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index d8a43cc94..dc0a5ed5e 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -14,7 +14,10 @@ OpenGLState OpenGLState::cur_state; bool OpenGLState::s_rgb_used; OpenGLState::OpenGLState() { // These all match default OpenGL values + geometry_shaders.enabled = false;
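// Illustrative sketch (not part of the patch) of how the repacked MaxwellUniformData above lines
// up with the "uvec4 config_pack" declared in the generated GLSL. Under std140 rules the three
// 4-byte scalars plus implicit padding fill exactly one 16-byte slot, which is why the struct's
// expected size drops from 64 to 48 bytes. The type and field layout below are placeholders.
#include <cstdint>
struct alignas(16) ConfigPackSlot {
    std::uint32_t instance_id; // config_pack[0]
    std::uint32_t flip_stage;  // config_pack[1]
    float y_direction;         // config_pack[2]; the shader reads it via uintBitsToFloat()
    std::uint32_t padding;     // config_pack[3], unused
};
static_assert(sizeof(ConfigPackSlot) == 16, "must occupy one uvec4 slot of the uniform block");
// GLSL side, as emitted by the generators above:
//   uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
//   float y_dir = uintBitsToFloat(config_pack[2]);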
framebuffer_srgb.enabled = false; + multisample_control.alpha_to_coverage = false; + multisample_control.alpha_to_one = false; cull.enabled = false; cull.mode = GL_BACK; cull.front_face = GL_CCW; @@ -22,17 +25,15 @@ OpenGLState::OpenGLState() { depth.test_enabled = false; depth.test_func = GL_LESS; depth.write_mask = GL_TRUE; - depth.depth_range_near = 0.0f; - depth.depth_range_far = 1.0f; primitive_restart.enabled = false; primitive_restart.index = 0; - - color_mask.red_enabled = GL_TRUE; - color_mask.green_enabled = GL_TRUE; - color_mask.blue_enabled = GL_TRUE; - color_mask.alpha_enabled = GL_TRUE; - + for (auto& item : color_mask) { + item.red_enabled = GL_TRUE; + item.green_enabled = GL_TRUE; + item.blue_enabled = GL_TRUE; + item.alpha_enabled = GL_TRUE; + } stencil.test_enabled = false; auto reset_stencil = [](auto& config) { config.test_func = GL_ALWAYS; @@ -45,19 +46,33 @@ OpenGLState::OpenGLState() { }; reset_stencil(stencil.front); reset_stencil(stencil.back); - - blend.enabled = true; - blend.rgb_equation = GL_FUNC_ADD; - blend.a_equation = GL_FUNC_ADD; - blend.src_rgb_func = GL_ONE; - blend.dst_rgb_func = GL_ZERO; - blend.src_a_func = GL_ONE; - blend.dst_a_func = GL_ZERO; - blend.color.red = 0.0f; - blend.color.green = 0.0f; - blend.color.blue = 0.0f; - blend.color.alpha = 0.0f; - + for (auto& item : viewports) { + item.x = 0; + item.y = 0; + item.width = 0; + item.height = 0; + item.depth_range_near = 0.0f; + item.depth_range_far = 1.0f; + item.scissor.enabled = false; + item.scissor.x = 0; + item.scissor.y = 0; + item.scissor.width = 0; + item.scissor.height = 0; + } + for (auto& item : blend) { + item.enabled = true; + item.rgb_equation = GL_FUNC_ADD; + item.a_equation = GL_FUNC_ADD; + item.src_rgb_func = GL_ONE; + item.dst_rgb_func = GL_ZERO; + item.src_a_func = GL_ONE; + item.dst_a_func = GL_ZERO; + } + independant_blend.enabled = false; + blend_color.red = 0.0f; + blend_color.green = 0.0f; + blend_color.blue = 0.0f; + blend_color.alpha = 0.0f; logic_op.enabled = false; logic_op.operation = GL_COPY; @@ -73,23 +88,32 @@ OpenGLState::OpenGLState() { draw.shader_program = 0; draw.program_pipeline = 0; - scissor.enabled = false; - scissor.x = 0; - scissor.y = 0; - scissor.width = 0; - scissor.height = 0; - - viewport.x = 0; - viewport.y = 0; - viewport.width = 0; - viewport.height = 0; - clip_distance = {}; point.size = 1; + fragment_color_clamp.enabled = false; + depth_clamp.far_plane = false; + depth_clamp.near_plane = false; + polygon_offset.fill_enable = false; + polygon_offset.line_enable = false; + polygon_offset.point_enable = false; + polygon_offset.factor = 0.0f; + polygon_offset.units = 0.0f; + polygon_offset.clamp = 0.0f; } -void OpenGLState::Apply() const { +void OpenGLState::ApplyDefaultState() { + glDisable(GL_FRAMEBUFFER_SRGB); + glDisable(GL_CULL_FACE); + glDisable(GL_DEPTH_TEST); + glDisable(GL_PRIMITIVE_RESTART); + glDisable(GL_STENCIL_TEST); + glEnable(GL_BLEND); + glDisable(GL_COLOR_LOGIC_OP); + glDisable(GL_SCISSOR_TEST); +} + +void OpenGLState::ApplySRgb() const { // sRGB if (framebuffer_srgb.enabled != cur_state.framebuffer_srgb.enabled) { if (framebuffer_srgb.enabled) { @@ -100,139 +124,346 @@ void OpenGLState::Apply() const { glDisable(GL_FRAMEBUFFER_SRGB); } } +} + +void OpenGLState::ApplyCulling() const { // Culling - if (cull.enabled != cur_state.cull.enabled) { + const bool cull_changed = cull.enabled != cur_state.cull.enabled; + if (cull_changed) { if (cull.enabled) { glEnable(GL_CULL_FACE); } else { glDisable(GL_CULL_FACE); } } + if (cull.enabled) 
{ + if (cull_changed || cull.mode != cur_state.cull.mode) { + glCullFace(cull.mode); + } - if (cull.mode != cur_state.cull.mode) { - glCullFace(cull.mode); + if (cull_changed || cull.front_face != cur_state.cull.front_face) { + glFrontFace(cull.front_face); + } } +} - if (cull.front_face != cur_state.cull.front_face) { - glFrontFace(cull.front_face); +void OpenGLState::ApplyColorMask() const { + if (independant_blend.enabled) { + for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { + const auto& updated = color_mask[i]; + const auto& current = cur_state.color_mask[i]; + if (updated.red_enabled != current.red_enabled || + updated.green_enabled != current.green_enabled || + updated.blue_enabled != current.blue_enabled || + updated.alpha_enabled != current.alpha_enabled) { + glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled, + updated.blue_enabled, updated.alpha_enabled); + } + } + } else { + const auto& updated = color_mask[0]; + const auto& current = cur_state.color_mask[0]; + if (updated.red_enabled != current.red_enabled || + updated.green_enabled != current.green_enabled || + updated.blue_enabled != current.blue_enabled || + updated.alpha_enabled != current.alpha_enabled) { + glColorMask(updated.red_enabled, updated.green_enabled, updated.blue_enabled, + updated.alpha_enabled); + } } +} +void OpenGLState::ApplyDepth() const { // Depth test - if (depth.test_enabled != cur_state.depth.test_enabled) { + const bool depth_test_changed = depth.test_enabled != cur_state.depth.test_enabled; + if (depth_test_changed) { if (depth.test_enabled) { glEnable(GL_DEPTH_TEST); } else { glDisable(GL_DEPTH_TEST); } } - - if (depth.test_func != cur_state.depth.test_func) { + if (depth.test_enabled && + (depth_test_changed || depth.test_func != cur_state.depth.test_func)) { glDepthFunc(depth.test_func); } - // Depth mask if (depth.write_mask != cur_state.depth.write_mask) { glDepthMask(depth.write_mask); } +} - // Depth range - if (depth.depth_range_near != cur_state.depth.depth_range_near || - depth.depth_range_far != cur_state.depth.depth_range_far) { - glDepthRange(depth.depth_range_near, depth.depth_range_far); - } - - // Primitive restart - if (primitive_restart.enabled != cur_state.primitive_restart.enabled) { +void OpenGLState::ApplyPrimitiveRestart() const { + const bool primitive_restart_changed = + primitive_restart.enabled != cur_state.primitive_restart.enabled; + if (primitive_restart_changed) { if (primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); } else { glDisable(GL_PRIMITIVE_RESTART); } } - if (primitive_restart.index != cur_state.primitive_restart.index) { + if (primitive_restart_changed || + (primitive_restart.enabled && + primitive_restart.index != cur_state.primitive_restart.index)) { glPrimitiveRestartIndex(primitive_restart.index); } +} - // Color mask - if (color_mask.red_enabled != cur_state.color_mask.red_enabled || - color_mask.green_enabled != cur_state.color_mask.green_enabled || - color_mask.blue_enabled != cur_state.color_mask.blue_enabled || - color_mask.alpha_enabled != cur_state.color_mask.alpha_enabled) { - glColorMask(color_mask.red_enabled, color_mask.green_enabled, color_mask.blue_enabled, - color_mask.alpha_enabled); - } - - // Stencil test - if (stencil.test_enabled != cur_state.stencil.test_enabled) { +void OpenGLState::ApplyStencilTest() const { + const bool stencil_test_changed = stencil.test_enabled != cur_state.stencil.test_enabled; + if (stencil_test_changed) { if (stencil.test_enabled) { 
glEnable(GL_STENCIL_TEST); } else { glDisable(GL_STENCIL_TEST); } } - auto config_stencil = [](GLenum face, const auto& config, const auto& prev_config) { - if (config.test_func != prev_config.test_func || config.test_ref != prev_config.test_ref || - config.test_mask != prev_config.test_mask) { - glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask); + if (stencil.test_enabled) { + auto config_stencil = [stencil_test_changed](GLenum face, const auto& config, + const auto& prev_config) { + if (stencil_test_changed || config.test_func != prev_config.test_func || + config.test_ref != prev_config.test_ref || + config.test_mask != prev_config.test_mask) { + glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask); + } + if (stencil_test_changed || config.action_depth_fail != prev_config.action_depth_fail || + config.action_depth_pass != prev_config.action_depth_pass || + config.action_stencil_fail != prev_config.action_stencil_fail) { + glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail, + config.action_depth_pass); + } + if (config.write_mask != prev_config.write_mask) { + glStencilMaskSeparate(face, config.write_mask); + } + }; + config_stencil(GL_FRONT, stencil.front, cur_state.stencil.front); + config_stencil(GL_BACK, stencil.back, cur_state.stencil.back); + } +} +// Viewport does not affect glClearBuffer so emulate viewport using scissor test +void OpenGLState::EmulateViewportWithScissor() { + auto& current = viewports[0]; + if (current.scissor.enabled) { + const GLint left = std::max(current.x, current.scissor.x); + const GLint right = + std::max(current.x + current.width, current.scissor.x + current.scissor.width); + const GLint bottom = std::max(current.y, current.scissor.y); + const GLint top = + std::max(current.y + current.height, current.scissor.y + current.scissor.height); + current.scissor.x = std::max(left, 0); + current.scissor.y = std::max(bottom, 0); + current.scissor.width = std::max(right - left, 0); + current.scissor.height = std::max(top - bottom, 0); + } else { + current.scissor.enabled = true; + current.scissor.x = current.x; + current.scissor.y = current.y; + current.scissor.width = current.width; + current.scissor.height = current.height; + } +} + +void OpenGLState::ApplyViewport() const { + if (geometry_shaders.enabled) { + for (GLuint i = 0; i < static_cast<GLuint>(Tegra::Engines::Maxwell3D::Regs::NumViewports); + i++) { + const auto& current = cur_state.viewports[i]; + const auto& updated = viewports[i]; + if (updated.x != current.x || updated.y != current.y || + updated.width != current.width || updated.height != current.height) { + glViewportIndexedf( + i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y), + static_cast<GLfloat>(updated.width), static_cast<GLfloat>(updated.height)); + } + if (updated.depth_range_near != current.depth_range_near || + updated.depth_range_far != current.depth_range_far) { + glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far); + } + const bool scissor_changed = updated.scissor.enabled != current.scissor.enabled; + if (scissor_changed) { + if (updated.scissor.enabled) { + glEnablei(GL_SCISSOR_TEST, i); + } else { + glDisablei(GL_SCISSOR_TEST, i); + } + } + if (updated.scissor.enabled && + (scissor_changed || updated.scissor.x != current.scissor.x || + updated.scissor.y != current.scissor.y || + updated.scissor.width != current.scissor.width || + updated.scissor.height != current.scissor.height)) { + glScissorIndexed(i,
updated.scissor.x, updated.scissor.y, updated.scissor.width, + updated.scissor.height); + } } - if (config.action_depth_fail != prev_config.action_depth_fail || - config.action_depth_pass != prev_config.action_depth_pass || - config.action_stencil_fail != prev_config.action_stencil_fail) { - glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail, - config.action_depth_pass); + } else { + const auto& current = cur_state.viewports[0]; + const auto& updated = viewports[0]; + if (updated.x != current.x || updated.y != current.y || updated.width != current.width || + updated.height != current.height) { + glViewport(updated.x, updated.y, updated.width, updated.height); } - if (config.write_mask != prev_config.write_mask) { - glStencilMaskSeparate(face, config.write_mask); + if (updated.depth_range_near != current.depth_range_near || + updated.depth_range_far != current.depth_range_far) { + glDepthRange(updated.depth_range_near, updated.depth_range_far); } - }; - config_stencil(GL_FRONT, stencil.front, cur_state.stencil.front); - config_stencil(GL_BACK, stencil.back, cur_state.stencil.back); + const bool scissor_changed = updated.scissor.enabled != current.scissor.enabled; + if (scissor_changed) { + if (updated.scissor.enabled) { + glEnable(GL_SCISSOR_TEST); + } else { + glDisable(GL_SCISSOR_TEST); + } + } + if (updated.scissor.enabled && (scissor_changed || updated.scissor.x != current.scissor.x || + updated.scissor.y != current.scissor.y || + updated.scissor.width != current.scissor.width || + updated.scissor.height != current.scissor.height)) { + glScissor(updated.scissor.x, updated.scissor.y, updated.scissor.width, + updated.scissor.height); + } + } +} - // Blending - if (blend.enabled != cur_state.blend.enabled) { - if (blend.enabled) { - ASSERT(!logic_op.enabled); +void OpenGLState::ApplyGlobalBlending() const { + const Blend& current = cur_state.blend[0]; + const Blend& updated = blend[0]; + const bool blend_changed = updated.enabled != current.enabled; + if (blend_changed) { + if (updated.enabled) { glEnable(GL_BLEND); } else { glDisable(GL_BLEND); } } + if (!updated.enabled) { + return; + } + if (blend_changed || updated.src_rgb_func != current.src_rgb_func || + updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func || + updated.dst_a_func != current.dst_a_func) { + glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, + updated.dst_a_func); + } - if (blend.color.red != cur_state.blend.color.red || - blend.color.green != cur_state.blend.color.green || - blend.color.blue != cur_state.blend.color.blue || - blend.color.alpha != cur_state.blend.color.alpha) { - glBlendColor(blend.color.red, blend.color.green, blend.color.blue, blend.color.alpha); + if (blend_changed || updated.rgb_equation != current.rgb_equation || + updated.a_equation != current.a_equation) { + glBlendEquationSeparate(updated.rgb_equation, updated.a_equation); + } +} + +void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const { + const Blend& updated = blend[target]; + const Blend& current = cur_state.blend[target]; + const bool blend_changed = updated.enabled != current.enabled || force; + if (blend_changed) { + if (updated.enabled) { + glEnablei(GL_BLEND, static_cast<GLuint>(target)); + } else { + glDisablei(GL_BLEND, static_cast<GLuint>(target)); + } + } + if (!updated.enabled) { + return; + } + if (blend_changed || updated.src_rgb_func != current.src_rgb_func || + updated.dst_rgb_func != current.dst_rgb_func || 
updated.src_a_func != current.src_a_func || + updated.dst_a_func != current.dst_a_func) { + glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func, + updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func); } - if (blend.src_rgb_func != cur_state.blend.src_rgb_func || - blend.dst_rgb_func != cur_state.blend.dst_rgb_func || - blend.src_a_func != cur_state.blend.src_a_func || - blend.dst_a_func != cur_state.blend.dst_a_func) { - glBlendFuncSeparate(blend.src_rgb_func, blend.dst_rgb_func, blend.src_a_func, - blend.dst_a_func); + if (blend_changed || updated.rgb_equation != current.rgb_equation || + updated.a_equation != current.a_equation) { + glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation, + updated.a_equation); } +} - if (blend.rgb_equation != cur_state.blend.rgb_equation || - blend.a_equation != cur_state.blend.a_equation) { - glBlendEquationSeparate(blend.rgb_equation, blend.a_equation); +void OpenGLState::ApplyBlending() const { + if (independant_blend.enabled) { + for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { + ApplyTargetBlending(i, + independant_blend.enabled != cur_state.independant_blend.enabled); + } + } else { + ApplyGlobalBlending(); + } + if (blend_color.red != cur_state.blend_color.red || + blend_color.green != cur_state.blend_color.green || + blend_color.blue != cur_state.blend_color.blue || + blend_color.alpha != cur_state.blend_color.alpha) { + glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha); } +} - // Logic Operation - if (logic_op.enabled != cur_state.logic_op.enabled) { +void OpenGLState::ApplyLogicOp() const { + const bool logic_op_changed = logic_op.enabled != cur_state.logic_op.enabled; + if (logic_op_changed) { if (logic_op.enabled) { - ASSERT(!blend.enabled); glEnable(GL_COLOR_LOGIC_OP); } else { glDisable(GL_COLOR_LOGIC_OP); } } - if (logic_op.operation != cur_state.logic_op.operation) { + if (logic_op.enabled && + (logic_op_changed || logic_op.operation != cur_state.logic_op.operation)) { glLogicOp(logic_op.operation); } +} + +void OpenGLState::ApplyPolygonOffset() const { + + const bool fill_enable_changed = + polygon_offset.fill_enable != cur_state.polygon_offset.fill_enable; + const bool line_enable_changed = + polygon_offset.line_enable != cur_state.polygon_offset.line_enable; + const bool point_enable_changed = + polygon_offset.point_enable != cur_state.polygon_offset.point_enable; + const bool factor_changed = polygon_offset.factor != cur_state.polygon_offset.factor; + const bool units_changed = polygon_offset.units != cur_state.polygon_offset.units; + const bool clamp_changed = polygon_offset.clamp != cur_state.polygon_offset.clamp; + + if (fill_enable_changed) { + if (polygon_offset.fill_enable) { + glEnable(GL_POLYGON_OFFSET_FILL); + } else { + glDisable(GL_POLYGON_OFFSET_FILL); + } + } + + if (line_enable_changed) { + if (polygon_offset.line_enable) { + glEnable(GL_POLYGON_OFFSET_LINE); + } else { + glDisable(GL_POLYGON_OFFSET_LINE); + } + } - // Textures + if (point_enable_changed) { + if (polygon_offset.point_enable) { + glEnable(GL_POLYGON_OFFSET_POINT); + } else { + glDisable(GL_POLYGON_OFFSET_POINT); + } + } + + if ((polygon_offset.fill_enable || polygon_offset.line_enable || polygon_offset.point_enable) && + (factor_changed || units_changed || clamp_changed)) { + + if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) { + glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp); + } 
else { + glPolygonOffset(polygon_offset.factor, polygon_offset.units); + UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0, + "Unimplemented Depth polygon offset clamp."); + } + } +} + +void OpenGLState::ApplyTextures() const { for (std::size_t i = 0; i < std::size(texture_units); ++i) { const auto& texture_unit = texture_units[i]; const auto& cur_state_texture_unit = cur_state.texture_units[i]; @@ -251,28 +482,29 @@ void OpenGLState::Apply() const { glTexParameteriv(texture_unit.target, GL_TEXTURE_SWIZZLE_RGBA, mask.data()); } } +} - // Samplers - { - bool has_delta{}; - std::size_t first{}, last{}; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers; - for (std::size_t i = 0; i < std::size(samplers); ++i) { - samplers[i] = texture_units[i].sampler; - if (samplers[i] != cur_state.texture_units[i].sampler) { - if (!has_delta) { - first = i; - has_delta = true; - } - last = i; +void OpenGLState::ApplySamplers() const { + bool has_delta{}; + std::size_t first{}, last{}; + std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers; + for (std::size_t i = 0; i < std::size(samplers); ++i) { + samplers[i] = texture_units[i].sampler; + if (samplers[i] != cur_state.texture_units[i].sampler) { + if (!has_delta) { + first = i; + has_delta = true; } + last = i; } - if (has_delta) { - glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), - samplers.data()); - } } + if (has_delta) { + glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), + samplers.data()); + } +} +void OpenGLState::ApplyFramebufferState() const { // Framebuffer if (draw.read_framebuffer != cur_state.draw.read_framebuffer) { glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer); @@ -280,7 +512,9 @@ void OpenGLState::Apply() const { if (draw.draw_framebuffer != cur_state.draw.draw_framebuffer) { glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer); } +} +void OpenGLState::ApplyVertexBufferState() const { // Vertex array if (draw.vertex_array != cur_state.draw.vertex_array) { glBindVertexArray(draw.vertex_array); @@ -290,7 +524,26 @@ void OpenGLState::Apply() const { if (draw.vertex_buffer != cur_state.draw.vertex_buffer) { glBindBuffer(GL_ARRAY_BUFFER, draw.vertex_buffer); } +} + +void OpenGLState::ApplyDepthClamp() const { + if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane && + depth_clamp.near_plane == cur_state.depth_clamp.near_plane) { + return; + } + if (depth_clamp.far_plane != depth_clamp.near_plane) { + UNIMPLEMENTED_MSG("Unimplemented Depth Clamp Separation!"); + } + if (depth_clamp.far_plane || depth_clamp.near_plane) { + glEnable(GL_DEPTH_CLAMP); + } else { + glDisable(GL_DEPTH_CLAMP); + } +} +void OpenGLState::Apply() const { + ApplyFramebufferState(); + ApplyVertexBufferState(); // Uniform buffer if (draw.uniform_buffer != cur_state.draw.uniform_buffer) { glBindBuffer(GL_UNIFORM_BUFFER, draw.uniform_buffer); @@ -305,27 +558,6 @@ void OpenGLState::Apply() const { if (draw.program_pipeline != cur_state.draw.program_pipeline) { glBindProgramPipeline(draw.program_pipeline); } - - // Scissor test - if (scissor.enabled != cur_state.scissor.enabled) { - if (scissor.enabled) { - glEnable(GL_SCISSOR_TEST); - } else { - glDisable(GL_SCISSOR_TEST); - } - } - - if (scissor.x != cur_state.scissor.x || scissor.y != cur_state.scissor.y || - scissor.width != cur_state.scissor.width || scissor.height != cur_state.scissor.height) { - glScissor(scissor.x, scissor.y, scissor.width, scissor.height); - } - - if 
(viewport.x != cur_state.viewport.x || viewport.y != cur_state.viewport.y || - viewport.width != cur_state.viewport.width || - viewport.height != cur_state.viewport.height) { - glViewport(viewport.x, viewport.y, viewport.width, viewport.height); - } - // Clip distance for (std::size_t i = 0; i < clip_distance.size(); ++i) { if (clip_distance[i] != cur_state.clip_distance[i]) { @@ -336,12 +568,41 @@ void OpenGLState::Apply() const { } } } - // Point if (point.size != cur_state.point.size) { glPointSize(point.size); } - + if (fragment_color_clamp.enabled != cur_state.fragment_color_clamp.enabled) { + glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, + fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE); + } + if (multisample_control.alpha_to_coverage != cur_state.multisample_control.alpha_to_coverage) { + if (multisample_control.alpha_to_coverage) { + glEnable(GL_SAMPLE_ALPHA_TO_COVERAGE); + } else { + glDisable(GL_SAMPLE_ALPHA_TO_COVERAGE); + } + } + if (multisample_control.alpha_to_one != cur_state.multisample_control.alpha_to_one) { + if (multisample_control.alpha_to_one) { + glEnable(GL_SAMPLE_ALPHA_TO_ONE); + } else { + glDisable(GL_SAMPLE_ALPHA_TO_ONE); + } + } + ApplyDepthClamp(); + ApplyColorMask(); + ApplyViewport(); + ApplyStencilTest(); + ApplySRgb(); + ApplyCulling(); + ApplyDepth(); + ApplyPrimitiveRestart(); + ApplyBlending(); + ApplyLogicOp(); + ApplyTextures(); + ApplySamplers(); + ApplyPolygonOffset(); cur_state = *this; } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 9e2c573b5..439bfbc98 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -40,17 +40,33 @@ public: } framebuffer_srgb; struct { + bool alpha_to_coverage; // GL_ALPHA_TO_COVERAGE + bool alpha_to_one; // GL_ALPHA_TO_ONE + } multisample_control; + + struct { + bool enabled; // GL_CLAMP_FRAGMENT_COLOR_ARB + } fragment_color_clamp; + + struct { + bool far_plane; + bool near_plane; + } depth_clamp; // GL_DEPTH_CLAMP + + struct { + bool enabled; // viewports arrays are only supported when geometry shaders are enabled. 
+ } geometry_shaders; + + struct { bool enabled; // GL_CULL_FACE GLenum mode; // GL_CULL_FACE_MODE GLenum front_face; // GL_FRONT_FACE } cull; struct { - bool test_enabled; // GL_DEPTH_TEST - GLenum test_func; // GL_DEPTH_FUNC - GLboolean write_mask; // GL_DEPTH_WRITEMASK - GLfloat depth_range_near; // GL_DEPTH_RANGE - GLfloat depth_range_far; // GL_DEPTH_RANGE + bool test_enabled; // GL_DEPTH_TEST + GLenum test_func; // GL_DEPTH_FUNC + GLboolean write_mask; // GL_DEPTH_WRITEMASK } depth; struct { @@ -58,13 +74,14 @@ public: GLuint index; } primitive_restart; // GL_PRIMITIVE_RESTART - struct { + struct ColorMask { GLboolean red_enabled; GLboolean green_enabled; GLboolean blue_enabled; GLboolean alpha_enabled; - } color_mask; // GL_COLOR_WRITEMASK - + }; + std::array<ColorMask, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> + color_mask; // GL_COLOR_WRITEMASK struct { bool test_enabled; // GL_STENCIL_TEST struct { @@ -78,7 +95,7 @@ public: } front, back; } stencil; - struct { + struct Blend { bool enabled; // GL_BLEND GLenum rgb_equation; // GL_BLEND_EQUATION_RGB GLenum a_equation; // GL_BLEND_EQUATION_ALPHA @@ -86,14 +103,19 @@ public: GLenum dst_rgb_func; // GL_BLEND_DST_RGB GLenum src_a_func; // GL_BLEND_SRC_ALPHA GLenum dst_a_func; // GL_BLEND_DST_ALPHA + }; + std::array<Blend, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> blend; - struct { - GLclampf red; - GLclampf green; - GLclampf blue; - GLclampf alpha; - } color; // GL_BLEND_COLOR - } blend; + struct { + bool enabled; + } independant_blend; + + struct { + GLclampf red; + GLclampf green; + GLclampf blue; + GLclampf alpha; + } blend_color; // GL_BLEND_COLOR struct { bool enabled; // GL_LOGIC_OP_MODE @@ -138,26 +160,37 @@ public: GLuint program_pipeline; // GL_PROGRAM_PIPELINE_BINDING } draw; - struct { - bool enabled; // GL_SCISSOR_TEST - GLint x; - GLint y; - GLsizei width; - GLsizei height; - } scissor; - - struct { + struct viewport { GLint x; GLint y; - GLsizei width; - GLsizei height; - } viewport; + GLint width; + GLint height; + GLfloat depth_range_near; // GL_DEPTH_RANGE + GLfloat depth_range_far; // GL_DEPTH_RANGE + struct { + bool enabled; // GL_SCISSOR_TEST + GLint x; + GLint y; + GLsizei width; + GLsizei height; + } scissor; + }; + std::array<viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; struct { float size; // GL_POINT_SIZE } point; - std::array<bool, 2> clip_distance; // GL_CLIP_DISTANCE + struct { + bool point_enable; + bool line_enable; + bool fill_enable; + GLfloat units; + GLfloat factor; + GLfloat clamp; + } polygon_offset; + + std::array<bool, 8> clip_distance; // GL_CLIP_DISTANCE OpenGLState(); @@ -173,7 +206,12 @@ public: } /// Apply this state as the current OpenGL state void Apply() const; - + /// Apply only the state affecting the framebuffer + void ApplyFramebufferState() const; + /// Apply only the state affecting the vertex buffer + void ApplyVertexBufferState() const; + /// Set the initial OpenGL state + static void ApplyDefaultState(); /// Resets any references to the given resource OpenGLState& UnbindTexture(GLuint handle); OpenGLState& ResetSampler(GLuint handle); @@ -182,12 +220,28 @@ public: OpenGLState& ResetBuffer(GLuint handle); OpenGLState& ResetVertexArray(GLuint handle); OpenGLState& ResetFramebuffer(GLuint handle); + void EmulateViewportWithScissor(); private: static OpenGLState cur_state; // Workaround for sRGB problems caused by // QT not supporting srgb output static bool s_rgb_used; + void ApplySRgb() const; + void ApplyCulling() const; + void ApplyColorMask() 
const; + void ApplyDepth() const; + void ApplyPrimitiveRestart() const; + void ApplyStencilTest() const; + void ApplyViewport() const; + void ApplyTargetBlending(std::size_t target, bool force) const; + void ApplyGlobalBlending() const; + void ApplyBlending() const; + void ApplyLogicOp() const; + void ApplyTextures() const; + void ApplySamplers() const; + void ApplyDepthClamp() const; + void ApplyPolygonOffset() const; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index e409228cc..b97b895a4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -6,9 +6,13 @@ #include <vector> #include "common/alignment.h" #include "common/assert.h" +#include "common/microprofile.h" #include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" +MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", + MP_RGB(128, 128, 192)); + namespace OpenGL { OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coherent) @@ -75,6 +79,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a } if (invalidate || !persistent) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 0f6dcab2b..a8833c06e 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -135,17 +135,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { return {}; } -inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode) { +inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, + Tegra::Texture::TextureMipmapFilter mip_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: - return GL_LINEAR; - case Tegra::Texture::TextureFilter::Nearest: - return GL_NEAREST; + case Tegra::Texture::TextureFilter::Linear: { + switch (mip_filter_mode) { + case Tegra::Texture::TextureMipmapFilter::None: + return GL_LINEAR; + case Tegra::Texture::TextureMipmapFilter::Nearest: + return GL_NEAREST_MIPMAP_LINEAR; + case Tegra::Texture::TextureMipmapFilter::Linear: + return GL_LINEAR_MIPMAP_LINEAR; + } } - LOG_CRITICAL(Render_OpenGL, "Unimplemented texture filter mode={}", - static_cast<u32>(filter_mode)); - UNREACHABLE(); - return {}; + case Tegra::Texture::TextureFilter::Nearest: { + switch (mip_filter_mode) { + case Tegra::Texture::TextureMipmapFilter::None: + return GL_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Nearest: + return GL_NEAREST_MIPMAP_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return GL_LINEAR_MIPMAP_NEAREST; + } + } + } + LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode)); + return GL_LINEAR; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -165,10 +180,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { return GL_CLAMP_TO_BORDER; case Tegra::Texture::WrapMode::MirrorOnceClampToEdge: return GL_MIRROR_CLAMP_TO_EDGE; + case Tegra::Texture::WrapMode::MirrorOnceBorder: + if (GL_EXT_texture_mirror_clamp) { + return 
GL_MIRROR_CLAMP_TO_BORDER_EXT; + } else { + return GL_MIRROR_CLAMP_TO_EDGE; + } } - LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -190,28 +210,31 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast<u32>(func)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", + static_cast<u32>(func)); + return GL_GREATER; } inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { switch (equation) { case Maxwell::Blend::Equation::Add: + case Maxwell::Blend::Equation::AddGL: return GL_FUNC_ADD; case Maxwell::Blend::Equation::Subtract: + case Maxwell::Blend::Equation::SubtractGL: return GL_FUNC_SUBTRACT; case Maxwell::Blend::Equation::ReverseSubtract: + case Maxwell::Blend::Equation::ReverseSubtractGL: return GL_FUNC_REVERSE_SUBTRACT; case Maxwell::Blend::Equation::Min: + case Maxwell::Blend::Equation::MinGL: return GL_MIN; case Maxwell::Blend::Equation::Max: + case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); + return GL_FUNC_ADD; } inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { @@ -274,9 +297,8 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); + return GL_ZERO; } inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { @@ -295,9 +317,8 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); + return GL_ZERO; } inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { @@ -327,33 +348,39 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); + return GL_ALWAYS; } inline GLenum StencilOp(Maxwell::StencilOp stencil) { switch (stencil) { case Maxwell::StencilOp::Keep: + case Maxwell::StencilOp::KeepOGL: return GL_KEEP; case Maxwell::StencilOp::Zero: + case Maxwell::StencilOp::ZeroOGL: return GL_ZERO; case Maxwell::StencilOp::Replace: + case Maxwell::StencilOp::ReplaceOGL: return GL_REPLACE; case Maxwell::StencilOp::Incr: + case Maxwell::StencilOp::IncrOGL: return GL_INCR; case Maxwell::StencilOp::Decr: + case Maxwell::StencilOp::DecrOGL: return GL_DECR; case Maxwell::StencilOp::Invert: 
+ case Maxwell::StencilOp::InvertOGL: return GL_INVERT; case Maxwell::StencilOp::IncrWrap: + case Maxwell::StencilOp::IncrWrapOGL: return GL_INCR_WRAP; case Maxwell::StencilOp::DecrWrap: + case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); + return GL_KEEP; } inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) { @@ -363,9 +390,8 @@ inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) { case Maxwell::Cull::FrontFace::CounterClockWise: return GL_CCW; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); + return GL_CCW; } inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) { @@ -377,9 +403,8 @@ inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) { case Maxwell::Cull::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); + return GL_BACK; } inline GLenum LogicOp(Maxwell::LogicOperation operation) { @@ -417,9 +442,8 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); - UNREACHABLE(); - return {}; + LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); + return GL_COPY; } } // namespace MaxwellToGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 90b68943d..235732d86 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -19,9 +19,9 @@ #include "core/settings.h" #include "core/telemetry_session.h" #include "core/tracer/recorder.h" +#include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/renderer_opengl.h" -#include "video_core/utils.h" namespace OpenGL { @@ -115,7 +115,8 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& window) RendererOpenGL::~RendererOpenGL() = default; /// Swap buffers (render frame) -void RendererOpenGL::SwapBuffers(boost::optional<const Tegra::FramebufferConfig&> framebuffer) { +void RendererOpenGL::SwapBuffers( + std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { ScopeAcquireGLContext acquire_context{render_window}; Core::System::GetInstance().GetPerfStats().EndSystemFrame(); @@ -124,11 +125,11 @@ void RendererOpenGL::SwapBuffers(boost::optional<const Tegra::FramebufferConfig& OpenGLState prev_state = OpenGLState::GetCurState(); state.Apply(); - if (framebuffer != boost::none) { + if (framebuffer) { // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != (GLsizei)framebuffer->width || - screen_info.texture.height != (GLsizei)framebuffer->height || - screen_info.texture.pixel_format != framebuffer->pixel_format) { + if (screen_info.texture.width != (GLsizei)framebuffer->get().width || + screen_info.texture.height != (GLsizei)framebuffer->get().height || + 
screen_info.texture.pixel_format != framebuffer->get().pixel_format) { // Reallocate texture if the framebuffer size has changed. // This is expected to not happen very often and hence should not be a // performance problem. @@ -137,7 +138,12 @@ void RendererOpenGL::SwapBuffers(boost::optional<const Tegra::FramebufferConfig& // Load the framebuffer from memory, draw it to the screen, and swap buffers LoadFBToScreenInfo(*framebuffer); - DrawScreen(); + + if (renderer_settings.screenshot_requested) + CaptureScreenshot(); + + DrawScreen(render_window.GetFramebufferLayout()); + render_window.SwapBuffers(); } @@ -303,6 +309,12 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, gl_framebuffer_data.resize(texture.width * texture.height * 4); break; default: + internal_format = GL_RGBA; + texture.gl_format = GL_RGBA; + texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; + gl_framebuffer_data.resize(texture.width * texture.height * 4); + LOG_CRITICAL(Render_OpenGL, "Unknown framebuffer pixel format: {}", + static_cast<u32>(framebuffer.pixel_format)); UNREACHABLE(); } @@ -376,14 +388,13 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, /** * Draws the emulated screens to the emulator window. */ -void RendererOpenGL::DrawScreen() { +void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { if (renderer_settings.set_background_color) { // Update background color before drawing glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, 0.0f); } - const auto& layout = render_window.GetFramebufferLayout(); const auto& screen = layout.screen; glViewport(0, 0, layout.width, layout.height); @@ -407,6 +418,37 @@ void RendererOpenGL::DrawScreen() { /// Updates the framerate void RendererOpenGL::UpdateFramerate() {} +void RendererOpenGL::CaptureScreenshot() { + // Draw the current frame to the screenshot framebuffer + screenshot_framebuffer.Create(); + GLuint old_read_fb = state.draw.read_framebuffer; + GLuint old_draw_fb = state.draw.draw_framebuffer; + state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; + state.Apply(); + + Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + + GLuint renderbuffer; + glGenRenderbuffers(1, &renderbuffer); + glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer); + glRenderbufferStorage(GL_RENDERBUFFER, GL_RGB8, layout.width, layout.height); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, renderbuffer); + + DrawScreen(layout); + + glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, + renderer_settings.screenshot_bits); + + screenshot_framebuffer.Release(); + state.draw.read_framebuffer = old_read_fb; + state.draw.draw_framebuffer = old_draw_fb; + state.Apply(); + glDeleteRenderbuffers(1, &renderbuffer); + + renderer_settings.screenshot_complete_callback(); + renderer_settings.screenshot_requested = false; +} + static const char* GetSource(GLenum source) { #define RET(s) \ case GL_DEBUG_SOURCE_##s: \ @@ -420,6 +462,7 @@ static const char* GetSource(GLenum source) { RET(OTHER); default: UNREACHABLE(); + return "Unknown source"; } #undef RET } @@ -438,6 +481,7 @@ static const char* GetType(GLenum type) { RET(MARKER); default: UNREACHABLE(); + return "Unknown type"; } #undef RET } @@ -483,7 +527,7 @@ bool RendererOpenGL::Init() { Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model); 
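// Condensed, illustrative sketch of the renderbuffer-backed capture pattern the new
// CaptureScreenshot() above follows; the function name and the width/height/dest parameters are
// placeholders, error handling is omitted, and a current GL context (with glad loaded) is assumed.
#include <glad/glad.h>
template <typename DrawFn>
void ReadbackFrame(GLsizei width, GLsizei height, void* dest, DrawFn&& draw_screen) {
    GLuint fbo{}, rbo{};
    glGenFramebuffers(1, &fbo);
    glGenRenderbuffers(1, &rbo);
    glBindFramebuffer(GL_FRAMEBUFFER, fbo);
    glBindRenderbuffer(GL_RENDERBUFFER, rbo);
    glRenderbufferStorage(GL_RENDERBUFFER, GL_RGB8, width, height);
    glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, rbo);

    draw_screen(); // e.g. DrawScreen(layout) in the renderer

    // Same format/type pair the screenshot path above requests.
    glReadPixels(0, 0, width, height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, dest);

    glBindFramebuffer(GL_FRAMEBUFFER, 0);
    glDeleteRenderbuffers(1, &rbo);
    glDeleteFramebuffers(1, &fbo);
}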
Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version); - if (!GLAD_GL_VERSION_3_3) { + if (!GLAD_GL_VERSION_4_3) { return false; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 961467a62..b85cc262f 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -16,6 +16,10 @@ namespace Core::Frontend { class EmuWindow; } +namespace Layout { +struct FramebufferLayout; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -51,7 +55,8 @@ public: ~RendererOpenGL() override; /// Swap buffers (render frame) - void SwapBuffers(boost::optional<const Tegra::FramebufferConfig&> framebuffer) override; + void SwapBuffers( + std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; /// Initialize the renderer bool Init() override; @@ -65,10 +70,12 @@ private: void ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer); - void DrawScreen(); + void DrawScreen(const Layout::FramebufferLayout& layout); void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h); void UpdateFramerate(); + void CaptureScreenshot(); + // Loads framebuffer from emulated memory into the display information structure void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer); // Fills active OpenGL texture with the given RGBA color. @@ -81,6 +88,7 @@ private: OGLVertexArray vertex_array; OGLBuffer vertex_buffer; OGLProgram shader; + OGLFramebuffer screenshot_framebuffer; /// Display information for Switch screen ScreenInfo screen_info; diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp new file mode 100644 index 000000000..d84634cb3 --- /dev/null +++ b/src/video_core/renderer_opengl/utils.cpp @@ -0,0 +1,38 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <string> +#include <fmt/format.h> +#include <glad/glad.h> +#include "common/common_types.h" +#include "video_core/renderer_opengl/utils.h" + +namespace OpenGL { + +void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info) { + if (!GLAD_GL_KHR_debug) { + return; // We don't need to throw an error as this is just for debugging + } + const std::string nice_addr = fmt::format("0x{:016x}", addr); + std::string object_label; + + if (extra_info.empty()) { + switch (identifier) { + case GL_TEXTURE: + object_label = "Texture@" + nice_addr; + break; + case GL_PROGRAM: + object_label = "Shader@" + nice_addr; + break; + default: + object_label = fmt::format("Object(0x{:x})@{}", identifier, nice_addr); + break; + } + } else { + object_label = extra_info + '@' + nice_addr; + } + glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str())); +} + +} // namespace OpenGL
\ No newline at end of file diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h new file mode 100644 index 000000000..1fcb6fc11 --- /dev/null +++ b/src/video_core/renderer_opengl/utils.h @@ -0,0 +1,15 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> +#include <glad/glad.h> +#include "common/common_types.h" + +namespace OpenGL { + +void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info = ""); + +} // namespace OpenGL
\ No newline at end of file diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp new file mode 100644 index 000000000..1a344229f --- /dev/null +++ b/src/video_core/surface.cpp @@ -0,0 +1,500 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/common_types.h" +#include "common/math_util.h" +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type) { + switch (texture_type) { + case Tegra::Texture::TextureType::Texture1D: + return SurfaceTarget::Texture1D; + case Tegra::Texture::TextureType::Texture2D: + case Tegra::Texture::TextureType::Texture2DNoMipmap: + return SurfaceTarget::Texture2D; + case Tegra::Texture::TextureType::Texture3D: + return SurfaceTarget::Texture3D; + case Tegra::Texture::TextureType::TextureCubemap: + return SurfaceTarget::TextureCubemap; + case Tegra::Texture::TextureType::TextureCubeArray: + return SurfaceTarget::TextureCubeArray; + case Tegra::Texture::TextureType::Texture1DArray: + return SurfaceTarget::Texture1DArray; + case Tegra::Texture::TextureType::Texture2DArray: + return SurfaceTarget::Texture2DArray; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type)); + UNREACHABLE(); + return SurfaceTarget::Texture2D; + } +} + +bool SurfaceTargetIsLayered(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture3D: + return false; + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + return true; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); + UNREACHABLE(); + return false; + } +} + +PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { + switch (format) { + case Tegra::DepthFormat::S8_Z24_UNORM: + return PixelFormat::S8Z24; + case Tegra::DepthFormat::Z24_S8_UNORM: + return PixelFormat::Z24S8; + case Tegra::DepthFormat::Z32_FLOAT: + return PixelFormat::Z32F; + case Tegra::DepthFormat::Z16_UNORM: + return PixelFormat::Z16; + case Tegra::DepthFormat::Z32_S8_X24_FLOAT: + return PixelFormat::Z32FS8; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); + UNREACHABLE(); + return PixelFormat::S8Z24; + } +} + +PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) { + switch (format) { + // TODO (Hexagon12): Converting SRGBA to RGBA is a hack and doesn't completely correct the + // gamma. 
+ case Tegra::RenderTargetFormat::RGBA8_SRGB: + return PixelFormat::RGBA8_SRGB; + case Tegra::RenderTargetFormat::RGBA8_UNORM: + return PixelFormat::ABGR8U; + case Tegra::RenderTargetFormat::RGBA8_SNORM: + return PixelFormat::ABGR8S; + case Tegra::RenderTargetFormat::RGBA8_UINT: + return PixelFormat::ABGR8UI; + case Tegra::RenderTargetFormat::BGRA8_SRGB: + return PixelFormat::BGRA8_SRGB; + case Tegra::RenderTargetFormat::BGRA8_UNORM: + return PixelFormat::BGRA8; + case Tegra::RenderTargetFormat::RGB10_A2_UNORM: + return PixelFormat::A2B10G10R10U; + case Tegra::RenderTargetFormat::RGBA16_FLOAT: + return PixelFormat::RGBA16F; + case Tegra::RenderTargetFormat::RGBA16_UNORM: + return PixelFormat::RGBA16U; + case Tegra::RenderTargetFormat::RGBA16_UINT: + return PixelFormat::RGBA16UI; + case Tegra::RenderTargetFormat::RGBA32_FLOAT: + return PixelFormat::RGBA32F; + case Tegra::RenderTargetFormat::RG32_FLOAT: + return PixelFormat::RG32F; + case Tegra::RenderTargetFormat::R11G11B10_FLOAT: + return PixelFormat::R11FG11FB10F; + case Tegra::RenderTargetFormat::B5G6R5_UNORM: + return PixelFormat::B5G6R5U; + case Tegra::RenderTargetFormat::BGR5A1_UNORM: + return PixelFormat::A1B5G5R5U; + case Tegra::RenderTargetFormat::RGBA32_UINT: + return PixelFormat::RGBA32UI; + case Tegra::RenderTargetFormat::R8_UNORM: + return PixelFormat::R8U; + case Tegra::RenderTargetFormat::R8_UINT: + return PixelFormat::R8UI; + case Tegra::RenderTargetFormat::RG16_FLOAT: + return PixelFormat::RG16F; + case Tegra::RenderTargetFormat::RG16_UINT: + return PixelFormat::RG16UI; + case Tegra::RenderTargetFormat::RG16_SINT: + return PixelFormat::RG16I; + case Tegra::RenderTargetFormat::RG16_UNORM: + return PixelFormat::RG16; + case Tegra::RenderTargetFormat::RG16_SNORM: + return PixelFormat::RG16S; + case Tegra::RenderTargetFormat::RG8_UNORM: + return PixelFormat::RG8U; + case Tegra::RenderTargetFormat::RG8_SNORM: + return PixelFormat::RG8S; + case Tegra::RenderTargetFormat::R16_FLOAT: + return PixelFormat::R16F; + case Tegra::RenderTargetFormat::R16_UNORM: + return PixelFormat::R16U; + case Tegra::RenderTargetFormat::R16_SNORM: + return PixelFormat::R16S; + case Tegra::RenderTargetFormat::R16_UINT: + return PixelFormat::R16UI; + case Tegra::RenderTargetFormat::R16_SINT: + return PixelFormat::R16I; + case Tegra::RenderTargetFormat::R32_FLOAT: + return PixelFormat::R32F; + case Tegra::RenderTargetFormat::R32_UINT: + return PixelFormat::R32UI; + case Tegra::RenderTargetFormat::RG32_UINT: + return PixelFormat::RG32UI; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); + UNREACHABLE(); + return PixelFormat::RGBA8_SRGB; + } +} + +PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, + Tegra::Texture::ComponentType component_type, + bool is_srgb) { + // TODO(Subv): Properly implement this + switch (format) { + case Tegra::Texture::TextureFormat::A8R8G8B8: + if (is_srgb) { + return PixelFormat::RGBA8_SRGB; + } + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::ABGR8U; + case Tegra::Texture::ComponentType::SNORM: + return PixelFormat::ABGR8S; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::ABGR8UI; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::B5G6R5: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::B5G6R5U; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", 
static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::A2B10G10R10: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::A2B10G10R10U; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::A1B5G5R5: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::A1B5G5R5U; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R8: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::R8U; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::R8UI; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::G8R8: + // TextureFormat::G8R8 is actually ordered red then green, as such we can use + // PixelFormat::RG8U and PixelFormat::RG8S. This was tested with The Legend of Zelda: Breath + // of the Wild, which uses this format to render the hearts on the UI. + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::RG8U; + case Tegra::Texture::ComponentType::SNORM: + return PixelFormat::RG8S; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R16_G16_B16_A16: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::RGBA16U; + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::RGBA16F; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::BF10GF11RF11: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::R11FG11FB10F; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R32_G32_B32_A32: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::RGBA32F; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::RGBA32UI; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R32_G32: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::RG32F; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::RG32UI; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R32_G32_B32: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::RGB32F; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R16: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::R16F; + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::R16U; + case Tegra::Texture::ComponentType::SNORM: + return PixelFormat::R16S; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::R16UI; + case Tegra::Texture::ComponentType::SINT: + return PixelFormat::R16I; + } + LOG_CRITICAL(HW_GPU, 
"Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::R32: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::R32F; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::R32UI; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::ZF32: + return PixelFormat::Z32F; + case Tegra::Texture::TextureFormat::Z16: + return PixelFormat::Z16; + case Tegra::Texture::TextureFormat::Z24S8: + return PixelFormat::Z24S8; + case Tegra::Texture::TextureFormat::DXT1: + return is_srgb ? PixelFormat::DXT1_SRGB : PixelFormat::DXT1; + case Tegra::Texture::TextureFormat::DXT23: + return is_srgb ? PixelFormat::DXT23_SRGB : PixelFormat::DXT23; + case Tegra::Texture::TextureFormat::DXT45: + return is_srgb ? PixelFormat::DXT45_SRGB : PixelFormat::DXT45; + case Tegra::Texture::TextureFormat::DXN1: + return PixelFormat::DXN1; + case Tegra::Texture::TextureFormat::DXN2: + switch (component_type) { + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::DXN2UNORM; + case Tegra::Texture::ComponentType::SNORM: + return PixelFormat::DXN2SNORM; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + case Tegra::Texture::TextureFormat::BC7U: + return is_srgb ? PixelFormat::BC7U_SRGB : PixelFormat::BC7U; + case Tegra::Texture::TextureFormat::BC6H_UF16: + return PixelFormat::BC6H_UF16; + case Tegra::Texture::TextureFormat::BC6H_SF16: + return PixelFormat::BC6H_SF16; + case Tegra::Texture::TextureFormat::ASTC_2D_4X4: + return is_srgb ? PixelFormat::ASTC_2D_4X4_SRGB : PixelFormat::ASTC_2D_4X4; + case Tegra::Texture::TextureFormat::ASTC_2D_5X4: + return is_srgb ? PixelFormat::ASTC_2D_5X4_SRGB : PixelFormat::ASTC_2D_5X4; + case Tegra::Texture::TextureFormat::ASTC_2D_5X5: + return is_srgb ? PixelFormat::ASTC_2D_5X5_SRGB : PixelFormat::ASTC_2D_5X5; + case Tegra::Texture::TextureFormat::ASTC_2D_8X8: + return is_srgb ? PixelFormat::ASTC_2D_8X8_SRGB : PixelFormat::ASTC_2D_8X8; + case Tegra::Texture::TextureFormat::ASTC_2D_8X5: + return is_srgb ? PixelFormat::ASTC_2D_8X5_SRGB : PixelFormat::ASTC_2D_8X5; + case Tegra::Texture::TextureFormat::ASTC_2D_10X8: + return is_srgb ? 
PixelFormat::ASTC_2D_10X8_SRGB : PixelFormat::ASTC_2D_10X8; + case Tegra::Texture::TextureFormat::R16_G16: + switch (component_type) { + case Tegra::Texture::ComponentType::FLOAT: + return PixelFormat::RG16F; + case Tegra::Texture::ComponentType::UNORM: + return PixelFormat::RG16; + case Tegra::Texture::ComponentType::SNORM: + return PixelFormat::RG16S; + case Tegra::Texture::ComponentType::UINT: + return PixelFormat::RG16UI; + case Tegra::Texture::ComponentType::SINT: + return PixelFormat::RG16I; + } + LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); + UNREACHABLE(); + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format), + static_cast<u32>(component_type)); + UNREACHABLE(); + return PixelFormat::ABGR8U; + } +} + +ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) { + // TODO(Subv): Implement more component types + switch (type) { + case Tegra::Texture::ComponentType::UNORM: + return ComponentType::UNorm; + case Tegra::Texture::ComponentType::FLOAT: + return ComponentType::Float; + case Tegra::Texture::ComponentType::SNORM: + return ComponentType::SNorm; + case Tegra::Texture::ComponentType::UINT: + return ComponentType::UInt; + case Tegra::Texture::ComponentType::SINT: + return ComponentType::SInt; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented component type={}", static_cast<u32>(type)); + UNREACHABLE(); + return ComponentType::UNorm; + } +} + +ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format) { + // TODO(Subv): Implement more render targets + switch (format) { + case Tegra::RenderTargetFormat::RGBA8_UNORM: + case Tegra::RenderTargetFormat::RGBA8_SRGB: + case Tegra::RenderTargetFormat::BGRA8_UNORM: + case Tegra::RenderTargetFormat::BGRA8_SRGB: + case Tegra::RenderTargetFormat::RGB10_A2_UNORM: + case Tegra::RenderTargetFormat::R8_UNORM: + case Tegra::RenderTargetFormat::RG16_UNORM: + case Tegra::RenderTargetFormat::R16_UNORM: + case Tegra::RenderTargetFormat::B5G6R5_UNORM: + case Tegra::RenderTargetFormat::BGR5A1_UNORM: + case Tegra::RenderTargetFormat::RG8_UNORM: + case Tegra::RenderTargetFormat::RGBA16_UNORM: + return ComponentType::UNorm; + case Tegra::RenderTargetFormat::RGBA8_SNORM: + case Tegra::RenderTargetFormat::RG16_SNORM: + case Tegra::RenderTargetFormat::R16_SNORM: + case Tegra::RenderTargetFormat::RG8_SNORM: + return ComponentType::SNorm; + case Tegra::RenderTargetFormat::RGBA16_FLOAT: + case Tegra::RenderTargetFormat::R11G11B10_FLOAT: + case Tegra::RenderTargetFormat::RGBA32_FLOAT: + case Tegra::RenderTargetFormat::RG32_FLOAT: + case Tegra::RenderTargetFormat::RG16_FLOAT: + case Tegra::RenderTargetFormat::R16_FLOAT: + case Tegra::RenderTargetFormat::R32_FLOAT: + return ComponentType::Float; + case Tegra::RenderTargetFormat::RGBA32_UINT: + case Tegra::RenderTargetFormat::RGBA16_UINT: + case Tegra::RenderTargetFormat::RG16_UINT: + case Tegra::RenderTargetFormat::R8_UINT: + case Tegra::RenderTargetFormat::R16_UINT: + case Tegra::RenderTargetFormat::RG32_UINT: + case Tegra::RenderTargetFormat::R32_UINT: + case Tegra::RenderTargetFormat::RGBA8_UINT: + return ComponentType::UInt; + case Tegra::RenderTargetFormat::RG16_SINT: + case Tegra::RenderTargetFormat::R16_SINT: + return ComponentType::SInt; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); + UNREACHABLE(); + return ComponentType::UNorm; + } +} + +PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) { + switch 
(format) { + case Tegra::FramebufferConfig::PixelFormat::ABGR8: + return PixelFormat::ABGR8U; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); + UNREACHABLE(); + return PixelFormat::ABGR8U; + } +} + +ComponentType ComponentTypeFromDepthFormat(Tegra::DepthFormat format) { + switch (format) { + case Tegra::DepthFormat::Z16_UNORM: + case Tegra::DepthFormat::S8_Z24_UNORM: + case Tegra::DepthFormat::Z24_S8_UNORM: + return ComponentType::UNorm; + case Tegra::DepthFormat::Z32_FLOAT: + case Tegra::DepthFormat::Z32_S8_X24_FLOAT: + return ComponentType::Float; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); + UNREACHABLE(); + return ComponentType::UNorm; + } +} + +SurfaceType GetFormatType(PixelFormat pixel_format) { + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxColorFormat)) { + return SurfaceType::ColorTexture; + } + + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) { + return SurfaceType::Depth; + } + + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) { + return SurfaceType::DepthStencil; + } + + // TODO(Subv): Implement the other formats + ASSERT(false); + + return SurfaceType::Invalid; +} + +bool IsPixelFormatASTC(PixelFormat format) { + switch (format) { + case PixelFormat::ASTC_2D_4X4: + case PixelFormat::ASTC_2D_5X4: + case PixelFormat::ASTC_2D_5X5: + case PixelFormat::ASTC_2D_8X8: + case PixelFormat::ASTC_2D_8X5: + case PixelFormat::ASTC_2D_4X4_SRGB: + case PixelFormat::ASTC_2D_5X4_SRGB: + case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_8X8_SRGB: + case PixelFormat::ASTC_2D_8X5_SRGB: + case PixelFormat::ASTC_2D_10X8: + case PixelFormat::ASTC_2D_10X8_SRGB: + return true; + default: + return false; + } +} + +std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { + return {GetDefaultBlockWidth(format), GetDefaultBlockHeight(format)}; +} + +bool IsFormatBCn(PixelFormat format) { + switch (format) { + case PixelFormat::DXT1: + case PixelFormat::DXT23: + case PixelFormat::DXT45: + case PixelFormat::DXN1: + case PixelFormat::DXN2SNORM: + case PixelFormat::DXN2UNORM: + case PixelFormat::BC7U: + case PixelFormat::BC6H_UF16: + case PixelFormat::BC6H_SF16: + case PixelFormat::DXT1_SRGB: + case PixelFormat::DXT23_SRGB: + case PixelFormat::DXT45_SRGB: + case PixelFormat::BC7U_SRGB: + return true; + } + return false; +} + +} // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h new file mode 100644 index 000000000..c2259c3c2 --- /dev/null +++ b/src/video_core/surface.h @@ -0,0 +1,469 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <climits> +#include <utility> +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "video_core/gpu.h" +#include "video_core/textures/texture.h" + +namespace VideoCore::Surface { + +enum class PixelFormat { + ABGR8U = 0, + ABGR8S = 1, + ABGR8UI = 2, + B5G6R5U = 3, + A2B10G10R10U = 4, + A1B5G5R5U = 5, + R8U = 6, + R8UI = 7, + RGBA16F = 8, + RGBA16U = 9, + RGBA16UI = 10, + R11FG11FB10F = 11, + RGBA32UI = 12, + DXT1 = 13, + DXT23 = 14, + DXT45 = 15, + DXN1 = 16, // This is also known as BC4 + DXN2UNORM = 17, + DXN2SNORM = 18, + BC7U = 19, + BC6H_UF16 = 20, + BC6H_SF16 = 21, + ASTC_2D_4X4 = 22, + BGRA8 = 23, + RGBA32F = 24, + RG32F = 25, + R32F = 26, + R16F = 27, + R16U = 28, + R16S = 29, + R16UI = 30, + R16I = 31, + RG16 = 32, + RG16F = 33, + RG16UI = 34, + RG16I = 35, + RG16S = 36, + RGB32F = 37, + RGBA8_SRGB = 38, + RG8U = 39, + RG8S = 40, + RG32UI = 41, + R32UI = 42, + ASTC_2D_8X8 = 43, + ASTC_2D_8X5 = 44, + ASTC_2D_5X4 = 45, + BGRA8_SRGB = 46, + DXT1_SRGB = 47, + DXT23_SRGB = 48, + DXT45_SRGB = 49, + BC7U_SRGB = 50, + ASTC_2D_4X4_SRGB = 51, + ASTC_2D_8X8_SRGB = 52, + ASTC_2D_8X5_SRGB = 53, + ASTC_2D_5X4_SRGB = 54, + ASTC_2D_5X5 = 55, + ASTC_2D_5X5_SRGB = 56, + ASTC_2D_10X8 = 57, + ASTC_2D_10X8_SRGB = 58, + + MaxColorFormat, + + // Depth formats + Z32F = 59, + Z16 = 60, + + MaxDepthFormat, + + // DepthStencil formats + Z24S8 = 61, + S8Z24 = 62, + Z32FS8 = 63, + + MaxDepthStencilFormat, + + Max = MaxDepthStencilFormat, + Invalid = 255, +}; + +static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); + +enum class ComponentType { + Invalid = 0, + SNorm = 1, + UNorm = 2, + SInt = 3, + UInt = 4, + Float = 5, +}; + +enum class SurfaceType { + ColorTexture = 0, + Depth = 1, + DepthStencil = 2, + Fill = 3, + Invalid = 4, +}; + +enum class SurfaceTarget { + Texture1D, + Texture2D, + Texture3D, + Texture1DArray, + Texture2DArray, + TextureCubemap, + TextureCubeArray, +}; + +constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{ + 1, // ABGR8U + 1, // ABGR8S + 1, // ABGR8UI + 1, // B5G6R5U + 1, // A2B10G10R10U + 1, // A1B5G5R5U + 1, // R8U + 1, // R8UI + 1, // RGBA16F + 1, // RGBA16U + 1, // RGBA16UI + 1, // R11FG11FB10F + 1, // RGBA32UI + 4, // DXT1 + 4, // DXT23 + 4, // DXT45 + 4, // DXN1 + 4, // DXN2UNORM + 4, // DXN2SNORM + 4, // BC7U + 4, // BC6H_UF16 + 4, // BC6H_SF16 + 4, // ASTC_2D_4X4 + 1, // BGRA8 + 1, // RGBA32F + 1, // RG32F + 1, // R32F + 1, // R16F + 1, // R16U + 1, // R16S + 1, // R16UI + 1, // R16I + 1, // RG16 + 1, // RG16F + 1, // RG16UI + 1, // RG16I + 1, // RG16S + 1, // RGB32F + 1, // RGBA8_SRGB + 1, // RG8U + 1, // RG8S + 1, // RG32UI + 1, // R32UI + 4, // ASTC_2D_8X8 + 4, // ASTC_2D_8X5 + 4, // ASTC_2D_5X4 + 1, // BGRA8_SRGB + 4, // DXT1_SRGB + 4, // DXT23_SRGB + 4, // DXT45_SRGB + 4, // BC7U_SRGB + 4, // ASTC_2D_4X4_SRGB + 4, // ASTC_2D_8X8_SRGB + 4, // ASTC_2D_8X5_SRGB + 4, // ASTC_2D_5X4_SRGB + 4, // ASTC_2D_5X5 + 4, // ASTC_2D_5X5_SRGB + 4, // ASTC_2D_10X8 + 4, // ASTC_2D_10X8_SRGB + 1, // Z32F + 1, // Z16 + 1, // Z24S8 + 1, // S8Z24 + 1, // Z32FS8 +}}; + +/** + * Gets the compression factor for the specified PixelFormat. This applies to just the + * "compressed width" and "compressed height", not the overall compression factor of a + * compressed image. This is used for maintaining proper surface sizes for compressed + * texture formats. 
+ */ +static constexpr u32 GetCompressionFactor(PixelFormat format) { + if (format == PixelFormat::Invalid) + return 0; + + ASSERT(static_cast<std::size_t>(format) < compression_factor_table.size()); + return compression_factor_table[static_cast<std::size_t>(format)]; +} + +constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ + 1, // ABGR8U + 1, // ABGR8S + 1, // ABGR8UI + 1, // B5G6R5U + 1, // A2B10G10R10U + 1, // A1B5G5R5U + 1, // R8U + 1, // R8UI + 1, // RGBA16F + 1, // RGBA16U + 1, // RGBA16UI + 1, // R11FG11FB10F + 1, // RGBA32UI + 4, // DXT1 + 4, // DXT23 + 4, // DXT45 + 4, // DXN1 + 4, // DXN2UNORM + 4, // DXN2SNORM + 4, // BC7U + 4, // BC6H_UF16 + 4, // BC6H_SF16 + 4, // ASTC_2D_4X4 + 1, // BGRA8 + 1, // RGBA32F + 1, // RG32F + 1, // R32F + 1, // R16F + 1, // R16U + 1, // R16S + 1, // R16UI + 1, // R16I + 1, // RG16 + 1, // RG16F + 1, // RG16UI + 1, // RG16I + 1, // RG16S + 1, // RGB32F + 1, // RGBA8_SRGB + 1, // RG8U + 1, // RG8S + 1, // RG32UI + 1, // R32UI + 8, // ASTC_2D_8X8 + 8, // ASTC_2D_8X5 + 5, // ASTC_2D_5X4 + 1, // BGRA8_SRGB + 4, // DXT1_SRGB + 4, // DXT23_SRGB + 4, // DXT45_SRGB + 4, // BC7U_SRGB + 4, // ASTC_2D_4X4_SRGB + 8, // ASTC_2D_8X8_SRGB + 8, // ASTC_2D_8X5_SRGB + 5, // ASTC_2D_5X4_SRGB + 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_SRGB + 10, // ASTC_2D_10X8 + 10, // ASTC_2D_10X8_SRGB + 1, // Z32F + 1, // Z16 + 1, // Z24S8 + 1, // S8Z24 + 1, // Z32FS8 +}}; + +static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { + if (format == PixelFormat::Invalid) + return 0; + + ASSERT(static_cast<std::size_t>(format) < block_width_table.size()); + return block_width_table[static_cast<std::size_t>(format)]; +} + +constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ + 1, // ABGR8U + 1, // ABGR8S + 1, // ABGR8UI + 1, // B5G6R5U + 1, // A2B10G10R10U + 1, // A1B5G5R5U + 1, // R8U + 1, // R8UI + 1, // RGBA16F + 1, // RGBA16U + 1, // RGBA16UI + 1, // R11FG11FB10F + 1, // RGBA32UI + 4, // DXT1 + 4, // DXT23 + 4, // DXT45 + 4, // DXN1 + 4, // DXN2UNORM + 4, // DXN2SNORM + 4, // BC7U + 4, // BC6H_UF16 + 4, // BC6H_SF16 + 4, // ASTC_2D_4X4 + 1, // BGRA8 + 1, // RGBA32F + 1, // RG32F + 1, // R32F + 1, // R16F + 1, // R16U + 1, // R16S + 1, // R16UI + 1, // R16I + 1, // RG16 + 1, // RG16F + 1, // RG16UI + 1, // RG16I + 1, // RG16S + 1, // RGB32F + 1, // RGBA8_SRGB + 1, // RG8U + 1, // RG8S + 1, // RG32UI + 1, // R32UI + 8, // ASTC_2D_8X8 + 5, // ASTC_2D_8X5 + 4, // ASTC_2D_5X4 + 1, // BGRA8_SRGB + 4, // DXT1_SRGB + 4, // DXT23_SRGB + 4, // DXT45_SRGB + 4, // BC7U_SRGB + 4, // ASTC_2D_4X4_SRGB + 8, // ASTC_2D_8X8_SRGB + 5, // ASTC_2D_8X5_SRGB + 4, // ASTC_2D_5X4_SRGB + 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_SRGB + 8, // ASTC_2D_10X8 + 8, // ASTC_2D_10X8_SRGB + 1, // Z32F + 1, // Z16 + 1, // Z24S8 + 1, // S8Z24 + 1, // Z32FS8 +}}; + +static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { + if (format == PixelFormat::Invalid) + return 0; + + ASSERT(static_cast<std::size_t>(format) < block_height_table.size()); + return block_height_table[static_cast<std::size_t>(format)]; +} + +constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ + 32, // ABGR8U + 32, // ABGR8S + 32, // ABGR8UI + 16, // B5G6R5U + 32, // A2B10G10R10U + 16, // A1B5G5R5U + 8, // R8U + 8, // R8UI + 64, // RGBA16F + 64, // RGBA16U + 64, // RGBA16UI + 32, // R11FG11FB10F + 128, // RGBA32UI + 64, // DXT1 + 128, // DXT23 + 128, // DXT45 + 64, // DXN1 + 128, // DXN2UNORM + 128, // DXN2SNORM + 128, // BC7U + 128, // BC6H_UF16 + 128, // BC6H_SF16 + 128, // ASTC_2D_4X4 + 32, // BGRA8 + 128, // 
RGBA32F + 64, // RG32F + 32, // R32F + 16, // R16F + 16, // R16U + 16, // R16S + 16, // R16UI + 16, // R16I + 32, // RG16 + 32, // RG16F + 32, // RG16UI + 32, // RG16I + 32, // RG16S + 96, // RGB32F + 32, // RGBA8_SRGB + 16, // RG8U + 16, // RG8S + 64, // RG32UI + 32, // R32UI + 128, // ASTC_2D_8X8 + 128, // ASTC_2D_8X5 + 128, // ASTC_2D_5X4 + 32, // BGRA8_SRGB + 64, // DXT1_SRGB + 128, // DXT23_SRGB + 128, // DXT45_SRGB + 128, // BC7U + 128, // ASTC_2D_4X4_SRGB + 128, // ASTC_2D_8X8_SRGB + 128, // ASTC_2D_8X5_SRGB + 128, // ASTC_2D_5X4_SRGB + 128, // ASTC_2D_5X5 + 128, // ASTC_2D_5X5_SRGB + 128, // ASTC_2D_10X8 + 128, // ASTC_2D_10X8_SRGB + 32, // Z32F + 16, // Z16 + 32, // Z24S8 + 32, // S8Z24 + 64, // Z32FS8 +}}; + +static constexpr u32 GetFormatBpp(PixelFormat format) { + if (format == PixelFormat::Invalid) + return 0; + + ASSERT(static_cast<std::size_t>(format) < bpp_table.size()); + return bpp_table[static_cast<std::size_t>(format)]; +} + +/// Returns the sizer in bytes of the specified pixel format +static constexpr u32 GetBytesPerPixel(PixelFormat pixel_format) { + if (pixel_format == PixelFormat::Invalid) { + return 0; + } + return GetFormatBpp(pixel_format) / CHAR_BIT; +} + +SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type); + +bool SurfaceTargetIsLayered(SurfaceTarget target); + +PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format); + +PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format); + +PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, + Tegra::Texture::ComponentType component_type, + bool is_srgb); + +ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type); + +ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format); + +PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format); + +ComponentType ComponentTypeFromDepthFormat(Tegra::DepthFormat format); + +SurfaceType GetFormatType(PixelFormat pixel_format); + +bool IsPixelFormatASTC(PixelFormat format); + +std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); + +/// Returns true if the specified PixelFormat is a BCn format, e.g. 
DXT or DXN +bool IsFormatBCn(PixelFormat format); + +} // namespace VideoCore::Surface diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index b1feacae9..bc50a4876 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -1598,27 +1598,29 @@ static void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth, namespace Tegra::Texture::ASTC { std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height, - uint32_t block_width, uint32_t block_height) { + uint32_t depth, uint32_t block_width, uint32_t block_height) { uint32_t blockIdx = 0; - std::vector<uint8_t> outData(height * width * 4); - for (uint32_t j = 0; j < height; j += block_height) { - for (uint32_t i = 0; i < width; i += block_width) { + std::vector<uint8_t> outData(height * width * depth * 4); + for (uint32_t k = 0; k < depth; k++) { + for (uint32_t j = 0; j < height; j += block_height) { + for (uint32_t i = 0; i < width; i += block_width) { - uint8_t* blockPtr = data.data() + blockIdx * 16; + uint8_t* blockPtr = data.data() + blockIdx * 16; - // Blocks can be at most 12x12 - uint32_t uncompData[144]; - ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); + // Blocks can be at most 12x12 + uint32_t uncompData[144]; + ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - uint32_t decompWidth = std::min(block_width, width - i); - uint32_t decompHeight = std::min(block_height, height - j); + uint32_t decompWidth = std::min(block_width, width - i); + uint32_t decompHeight = std::min(block_height, height - j); - uint8_t* outRow = outData.data() + (j * width + i) * 4; - for (uint32_t jj = 0; jj < decompHeight; jj++) { - memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); - } + uint8_t* outRow = outData.data() + (j * width + i) * 4; + for (uint32_t jj = 0; jj < decompHeight; jj++) { + memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); + } - blockIdx++; + blockIdx++; + } } } diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index f0d7c0e56..d419dd025 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -10,6 +10,6 @@ namespace Tegra::Texture::ASTC { std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height, - uint32_t block_width, uint32_t block_height); + uint32_t depth, uint32_t block_width, uint32_t block_height); } // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 550ca856c..5db75de22 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -37,25 +37,28 @@ struct alignas(64) SwizzleTable { std::array<std::array<u16, M>, N> values{}; }; -constexpr auto legacy_swizzle_table = SwizzleTable<8, 64, 1>(); -constexpr auto fast_swizzle_table = SwizzleTable<8, 4, 16>(); +constexpr u32 gob_size_x = 64; +constexpr u32 gob_size_y = 8; +constexpr u32 gob_size_z = 1; +constexpr u32 gob_size = gob_size_x * gob_size_y * gob_size_z; +constexpr u32 fast_swizzle_align = 16; + +constexpr auto legacy_swizzle_table = SwizzleTable<gob_size_y, gob_size_x, gob_size_z>(); +constexpr auto fast_swizzle_table = SwizzleTable<gob_size_y, 4, fast_swizzle_align>(); /** * This function manages ALL the GOBs(Group of Bytes) Inside a single block. * Instead of going gob by gob, we map the coordinates inside a block and manage from * those. 
Block_Width is assumed to be 1. */ -void PreciseProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, +void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end, const u32 z_end, const u32 tile_offset, const u32 xy_block_size, const u32 layer_z, const u32 stride_x, const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { std::array<u8*, 2> data_ptrs; u32 z_address = tile_offset; - const u32 gob_size_x = 64; - const u32 gob_size_y = 8; - const u32 gob_size_z = 1; - const u32 gob_size = gob_size_x * gob_size_y * gob_size_z; + for (u32 z = z_start; z < z_end; z++) { u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; @@ -81,7 +84,7 @@ void PreciseProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unsw * Instead of going gob by gob, we map the coordinates inside a block and manage from * those. Block_Width is assumed to be 1. */ -void FastProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, +void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, const u32 y_end, const u32 z_end, const u32 tile_offset, const u32 xy_block_size, const u32 layer_z, const u32 stride_x, @@ -90,23 +93,19 @@ void FastProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizz u32 z_address = tile_offset; const u32 x_startb = x_start * bytes_per_pixel; const u32 x_endb = x_end * bytes_per_pixel; - const u32 copy_size = 16; - const u32 gob_size_x = 64; - const u32 gob_size_y = 8; - const u32 gob_size_z = 1; - const u32 gob_size = gob_size_x * gob_size_y * gob_size_z; + for (u32 z = z_start; z < z_end; z++) { u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; for (u32 y = y_start; y < y_end; y++) { const auto& table = fast_swizzle_table[y % gob_size_y]; - for (u32 xb = x_startb; xb < x_endb; xb += copy_size) { - const u32 swizzle_offset{y_address + table[(xb / copy_size) % 4]}; + for (u32 xb = x_startb; xb < x_endb; xb += fast_swizzle_align) { + const u32 swizzle_offset{y_address + table[(xb / fast_swizzle_align) % 4]}; const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; const u32 pixel_index{out_x + pixel_base}; data_ptrs[unswizzle] = swizzled_data + swizzle_offset; data_ptrs[!unswizzle] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); + std::memcpy(data_ptrs[0], data_ptrs[1], fast_swizzle_align); } pixel_base += stride_x; if ((y + 1) % gob_size_y == 0) @@ -126,23 +125,23 @@ void FastProcessBlock(u8* swizzled_data, u8* unswizzled_data, const bool unswizz * https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces */ template <bool fast> -void SwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, const u32 width, - const u32 height, const u32 depth, const u32 bytes_per_pixel, - const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth) { +void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, + const u32 width, const u32 height, const u32 depth, const u32 bytes_per_pixel, + const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth, + const u32 width_spacing) { auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; const u32 stride_x = width * out_bytes_per_pixel; 
const u32 layer_z = height * stride_x; - const u32 gob_x_bytes = 64; - const u32 gob_elements_x = gob_x_bytes / bytes_per_pixel; - const u32 gob_elements_y = 8; - const u32 gob_elements_z = 1; + const u32 gob_elements_x = gob_size_x / bytes_per_pixel; + constexpr u32 gob_elements_y = gob_size_y; + constexpr u32 gob_elements_z = gob_size_z; const u32 block_x_elements = gob_elements_x; const u32 block_y_elements = gob_elements_y * block_height; const u32 block_z_elements = gob_elements_z * block_depth; - const u32 blocks_on_x = div_ceil(width, block_x_elements); + const u32 aligned_width = Common::AlignUp(width, gob_elements_x * width_spacing); + const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements); const u32 blocks_on_y = div_ceil(height, block_y_elements); const u32 blocks_on_z = div_ceil(depth, block_z_elements); - const u32 gob_size = gob_x_bytes * gob_elements_y * gob_elements_z; const u32 xy_block_size = gob_size * block_height; const u32 block_size = xy_block_size * block_depth; u32 tile_offset = 0; @@ -171,14 +170,16 @@ void SwizzledData(u8* swizzled_data, u8* unswizzled_data, const bool unswizzle, } void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, - u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth) { - if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % 16 == 0) { + u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data, + bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) { + if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % fast_swizzle_align == 0) { SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height, block_depth); + bytes_per_pixel, out_bytes_per_pixel, block_height, block_depth, + width_spacing); } else { SwizzledData<false>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height, block_depth); + bytes_per_pixel, out_bytes_per_pixel, block_height, block_depth, + width_spacing); } } @@ -202,6 +203,8 @@ u32 BytesPerPixel(TextureFormat format) { case TextureFormat::ASTC_2D_5X4: case TextureFormat::ASTC_2D_8X8: case TextureFormat::ASTC_2D_8X5: + case TextureFormat::ASTC_2D_10X8: + case TextureFormat::ASTC_2D_5X5: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::BF10GF11RF11: @@ -223,31 +226,42 @@ u32 BytesPerPixel(TextureFormat format) { return 8; default: UNIMPLEMENTED_MSG("Format not implemented"); - break; + return 1; } } -std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width, - u32 height, u32 depth, u32 block_height, u32 block_depth) { +void UnswizzleTexture(u8* const unswizzled_data, VAddr address, u32 tile_size_x, u32 tile_size_y, + u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, + u32 block_depth, u32 width_spacing) { + CopySwizzledData((width + tile_size_x - 1) / tile_size_x, + (height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel, + bytes_per_pixel, Memory::GetPointer(address), unswizzled_data, true, + block_height, block_depth, width_spacing); +} + +std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y, + u32 bytes_per_pixel, u32 width, u32 height, u32 depth, + u32 block_height, u32 block_depth, u32 width_spacing) { std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel); - CopySwizzledData(width / tile_size, 
height / tile_size, depth, bytes_per_pixel, bytes_per_pixel, - Memory::GetPointer(address), unswizzled_data.data(), true, block_height, - block_depth); + UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel, + width, height, depth, block_height, block_depth, width_spacing); return unswizzled_data; } void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data, u32 block_height) { - const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + 63) / 64}; + const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / + gob_size_x}; for (u32 line = 0; line < subrect_height; ++line) { const u32 gob_address_y = - (line / (8 * block_height)) * 512 * block_height * image_width_in_gobs + - (line % (8 * block_height) / 8) * 512; - const auto& table = legacy_swizzle_table[line % 8]; + (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + + ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[line % gob_size_y]; for (u32 x = 0; x < subrect_width; ++x) { - const u32 gob_address = gob_address_y + (x * bytes_per_pixel / 64) * 512 * block_height; - const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % 64]; + const u32 gob_address = + gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height; + const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x]; const VAddr source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; const VAddr dest_addr = swizzled_data + swizzled_offset; @@ -261,13 +275,13 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 block_height, u32 offset_x, u32 offset_y) { for (u32 line = 0; line < subrect_height; ++line) { const u32 y2 = line + offset_y; - const u32 gob_address_y = - (y2 / (8 * block_height)) * 512 * block_height + (y2 % (8 * block_height) / 8) * 512; - const auto& table = legacy_swizzle_table[y2 % 8]; + const u32 gob_address_y = (y2 / (gob_size_y * block_height)) * gob_size * block_height + + ((y2 % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[y2 % gob_size_y]; for (u32 x = 0; x < subrect_width; ++x) { const u32 x2 = (x + offset_x) * bytes_per_pixel; - const u32 gob_address = gob_address_y + (x2 / 64) * 512 * block_height; - const u32 swizzled_offset = gob_address + table[x2 % 64]; + const u32 gob_address = gob_address_y + (x2 / gob_size_x) * gob_size * block_height; + const u32 swizzled_offset = gob_address + table[x2 % gob_size_x]; const VAddr dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel; const VAddr source_addr = swizzled_data + swizzled_offset; @@ -292,6 +306,8 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat case TextureFormat::BC6H_SF16: case TextureFormat::ASTC_2D_4X4: case TextureFormat::ASTC_2D_8X8: + case TextureFormat::ASTC_2D_5X5: + case TextureFormat::ASTC_2D_10X8: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::A1B5G5R5: @@ -319,12 +335,9 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth) { if (tiled) { - const u32 gobs_in_x = 64; - const u32 gobs_in_y = 8; - const u32 gobs_in_z = 1; - 
const u32 aligned_width = Common::AlignUp(width * bytes_per_pixel, gobs_in_x); - const u32 aligned_height = Common::AlignUp(height, gobs_in_y * block_height); - const u32 aligned_depth = Common::AlignUp(depth, gobs_in_z * block_depth); + const u32 aligned_width = Common::AlignUp(width * bytes_per_pixel, gob_size_x); + const u32 aligned_height = Common::AlignUp(height, gob_size_y * block_height); + const u32 aligned_depth = Common::AlignUp(depth, gob_size_z * block_depth); return aligned_width * aligned_height * aligned_depth; } else { return width * height * depth * bytes_per_pixel; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index 4726f54a5..85b7e9f7b 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -10,18 +10,32 @@ namespace Tegra::Texture { +// GOBSize constant. Calculated by 64 bytes in x multiplied by 8 y coords, represents +// an small rect of (64/bytes_per_pixel)X8. +inline std::size_t GetGOBSize() { + return 512; +} + +/** + * Unswizzles a swizzled texture without changing its format. + */ +void UnswizzleTexture(u8* unswizzled_data, VAddr address, u32 tile_size_x, u32 tile_size_y, + u32 bytes_per_pixel, u32 width, u32 height, u32 depth, + u32 block_height = TICEntry::DefaultBlockHeight, + u32 block_depth = TICEntry::DefaultBlockHeight, u32 width_spacing = 0); /** * Unswizzles a swizzled texture without changing its format. */ -std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width, - u32 height, u32 depth, +std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y, + u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height = TICEntry::DefaultBlockHeight, - u32 block_depth = TICEntry::DefaultBlockHeight); + u32 block_depth = TICEntry::DefaultBlockHeight, + u32 width_spacing = 0); /// Copies texture data from a buffer and performs swizzling/unswizzling as necessary. void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth); + bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing); /** * Decodes an unswizzled texture into a A8R8G8B8 texture. 
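The decoders changes above fold the hard-coded 64/8/1 GOB dimensions into shared gob_size_x/gob_size_y/gob_size_z constants and feed them into CalculateSize(). A minimal, self-contained sketch of that tiled-size math, with a local align_up standing in for Common::AlignUp; the 100x100 RGBA8 example is illustrative and not taken from the commit:

#include <cstddef>
#include <cstdint>

// Sketch of the tiled branch of CalculateSize(): a GOB is 64 bytes wide and
// 8 rows tall (512 bytes), and a block stacks block_height GOBs in Y and
// block_depth GOBs in Z.
constexpr std::uint32_t align_up(std::uint32_t value, std::uint32_t align) {
    return ((value + align - 1) / align) * align;
}

constexpr std::size_t TiledSizeSketch(std::uint32_t bytes_per_pixel, std::uint32_t width,
                                      std::uint32_t height, std::uint32_t depth,
                                      std::uint32_t block_height, std::uint32_t block_depth) {
    const std::uint32_t aligned_width = align_up(width * bytes_per_pixel, 64); // gob_size_x
    const std::uint32_t aligned_height = align_up(height, 8 * block_height);   // gob_size_y
    const std::uint32_t aligned_depth = align_up(depth, 1 * block_depth);      // gob_size_z
    return std::size_t{aligned_width} * aligned_height * aligned_depth;
}

// A 100x100 RGBA8 surface (4 bytes/pixel) with block_height = 16 rounds up to
// 448 x 128 x 1 bytes = 57344 bytes of block-linear storage.
static_assert(TiledSizeSketch(4, 100, 100, 1, 16, 1) == 57344);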
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index d12d2ecb8..e7c78bee2 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -166,22 +166,34 @@ struct TICEntry { BitField<3, 3, u32> block_height; BitField<6, 3, u32> block_depth; + BitField<10, 3, u32> tile_width_spacing; + // High 16 bits of the pitch value BitField<0, 16, u32> pitch_high; - + BitField<26, 1, u32> use_header_opt_control; + BitField<27, 1, u32> depth_texture; BitField<28, 4, u32> max_mip_level; }; union { BitField<0, 16, u32> width_minus_1; BitField<22, 1, u32> srgb_conversion; BitField<23, 4, TextureType> texture_type; + BitField<29, 3, u32> border_size; }; union { BitField<0, 16, u32> height_minus_1; BitField<16, 15, u32> depth_minus_1; }; + union { + BitField<6, 13, u32> mip_lod_bias; + BitField<27, 3, u32> max_anisotropy; + }; - INSERT_PADDING_BYTES(8); + union { + BitField<0, 4, u32> res_min_mip_level; + BitField<4, 4, u32> res_max_mip_level; + BitField<12, 12, u32> min_lod_clamp; + }; GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); @@ -275,13 +287,25 @@ struct TSCEntry { BitField<6, 3, WrapMode> wrap_p; BitField<9, 1, u32> depth_compare_enabled; BitField<10, 3, DepthCompareFunc> depth_compare_func; + BitField<13, 1, u32> srgb_conversion; + BitField<20, 3, u32> max_anisotropy; }; union { BitField<0, 2, TextureFilter> mag_filter; BitField<4, 2, TextureFilter> min_filter; BitField<6, 2, TextureMipmapFilter> mip_filter; + BitField<9, 1, u32> cubemap_interface_filtering; + BitField<12, 13, u32> mip_lod_bias; + }; + union { + BitField<0, 12, u32> min_lod_clamp; + BitField<12, 12, u32> max_lod_clamp; + BitField<24, 8, u32> srgb_border_color_r; + }; + union { + BitField<12, 8, u32> srgb_border_color_g; + BitField<20, 8, u32> srgb_border_color_b; }; - INSERT_PADDING_BYTES(8); float border_color_r; float border_color_g; float border_color_b; diff --git a/src/video_core/utils.h b/src/video_core/utils.h deleted file mode 100644 index 237cc1307..000000000 --- a/src/video_core/utils.h +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace VideoCore { - -// 8x8 Z-Order coordinate from 2D coordinates -static inline u32 MortonInterleave(u32 x, u32 y) { - static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15}; - static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a}; - return xlut[x % 8] + ylut[y % 8]; -} - -/** - * Calculates the offset of the position of the pixel in Morton order - */ -static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) { - // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each - // of which is composed of four 2x2 subtiles each of which is composed of four texels. - // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g. - // texels are laid out in a 2x2 subtile like this: - // 2 3 - // 0 1 - // - // The full 8x8 tile has the texels arranged like this: - // - // 42 43 46 47 58 59 62 63 - // 40 41 44 45 56 57 60 61 - // 34 35 38 39 50 51 54 55 - // 32 33 36 37 48 49 52 53 - // 10 11 14 15 26 27 30 31 - // 08 09 12 13 24 25 28 29 - // 02 03 06 07 18 19 22 23 - // 00 01 04 05 16 17 20 21 - // - // This pattern is what's called Z-order curve, or Morton order. 
- - const unsigned int block_height = 8; - const unsigned int coarse_x = x & ~7; - - u32 i = VideoCore::MortonInterleave(x, y); - - const unsigned int offset = coarse_x * block_height; - - return (i + offset) * bytes_per_pixel; -} - -static inline u32 MortonInterleave128(u32 x, u32 y) { - // 128x128 Z-Order coordinate from 2D coordinates - static constexpr u32 xlut[] = { - 0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042, - 0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809, - 0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000, - 0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043, - 0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a, - 0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001, - 0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, - 0x2049, 0x204a, 0x204b, 0x2800, 0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b, - 0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002, - 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049, - 0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840, - 0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003, - 0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a, - 0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841, - 0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008, - 0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b, - 0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842, - 0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009, - 0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2800, - 0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843, - 0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, - 0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801, - 0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848, - 0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b, - 0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802, - 0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849, - 0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040, - 0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803, - 0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a, - 0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041, - 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2800, 0x2801, 0x2802, 0x2803, 0x2808, - 0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b, - 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042, - 0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809, - 0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b, - }; - static constexpr u32 ylut[] = { 
- 0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090, - 0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124, - 0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200, - 0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294, - 0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330, - 0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404, - 0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0, - 0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534, - 0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610, - 0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4, - 0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780, - 0x0784, 0x0790, 0x0794, 0x07a0, 0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014, - 0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0, - 0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184, - 0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220, - 0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4, - 0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390, - 0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424, - 0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500, - 0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594, - 0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630, - 0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704, - 0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0, - 0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034, - 0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110, - 0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4, - 0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280, - 0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314, - 0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0, - 0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484, - 0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520, - 0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4, - 0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690, - 0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724, - 0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0, 0x07a4, 0x07b0, 0x07b4, - }; - return xlut[x % 128] + ylut[y % 128]; -} - -static inline u32 GetMortonOffset128(u32 x, u32 y, u32 bytes_per_pixel) { - // Calculates the offset of the position of the pixel in Morton order - // Framebuffer images are split into 128x128 tiles. 
- - const unsigned int block_height = 128; - const unsigned int coarse_x = x & ~127; - - u32 i = MortonInterleave128(x, y); - - const unsigned int offset = coarse_x * block_height; - - return (i + offset) * bytes_per_pixel; -} - -static inline void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixel, - u32 gl_bytes_per_pixel, u8* morton_data, u8* gl_data, - bool morton_to_gl) { - u8* data_ptrs[2]; - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~127; - u32 morton_offset = - GetMortonOffset128(x, y, bytes_per_pixel) + coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + y * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); - } - } -} - -static void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, - std::string extra_info = "") { - if (!GLAD_GL_KHR_debug) { - return; // We don't need to throw an error as this is just for debugging - } - const std::string nice_addr = fmt::format("0x{:016x}", addr); - std::string object_label; - - if (extra_info.empty()) { - switch (identifier) { - case GL_TEXTURE: - object_label = "Texture@" + nice_addr; - break; - case GL_PROGRAM: - object_label = "Shader@" + nice_addr; - break; - default: - object_label = fmt::format("Object(0x{:x})@{}", identifier, nice_addr); - break; - } - } else { - object_label = extra_info + '@' + nice_addr; - } - glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str())); -} - -} // namespace VideoCore diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 07e3a7d24..f7de3471b 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -3,6 +3,8 @@ // Refer to the license.txt file included. #include <memory> +#include "core/core.h" +#include "core/settings.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/video_core.h" @@ -13,4 +15,10 @@ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_wind return std::make_unique<OpenGL::RendererOpenGL>(emu_window); } +u16 GetResolutionScaleFactor(const RendererBase& renderer) { + return !Settings::values.resolution_factor + ? renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio() + : Settings::values.resolution_factor; +} + } // namespace VideoCore diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index f79f85dfe..5b373bcb1 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -22,4 +22,6 @@ class RendererBase; */ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window); +u16 GetResolutionScaleFactor(const RendererBase& renderer); + } // namespace VideoCore |
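GetResolutionScaleFactor(), added at the end of the diff, treats a resolution_factor setting of 0 as "auto" and falls back to the render window's own scaling ratio. A hypothetical call site when sizing a scaled render target (the local names and dimensions are placeholders, not the project's code):

// Hypothetical use of the new helper.
const u16 scale = VideoCore::GetResolutionScaleFactor(renderer);
const u32 scaled_width = unscaled_width * scale;   // e.g. 1280 -> 2560 at 2x
const u32 scaled_height = unscaled_height * scale; // e.g. 720  -> 1440 at 2x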

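For the SwapBuffers() signature change earlier in the diff: std::optional cannot hold a reference directly, so the old boost::optional<const Tegra::FramebufferConfig&> parameter becomes std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>>, callers wrap the config (for example with std::cref), and the renderer reads it through framebuffer->get(). A sketch of the call sites this implies (renderer and GetCurrentFramebufferConfig are hypothetical names used only for illustration):

#include <functional>
#include <optional>

// Presenting a frame: std::cref lets the optional carry a reference without
// copying the framebuffer config.
const Tegra::FramebufferConfig& config = GetCurrentFramebufferConfig();
renderer.SwapBuffers(std::cref(config));

// No new framebuffer this swap: pass an empty optional instead of boost::none.
renderer.SwapBuffers(std::nullopt);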