diff options
Diffstat (limited to 'src/video_core')
37 files changed, 2379 insertions, 962 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index aa5bc3bbe..f5ae57039 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -5,6 +5,8 @@ add_library(video_core STATIC debug_utils/debug_utils.h engines/fermi_2d.cpp engines/fermi_2d.h + engines/kepler_memory.cpp + engines/kepler_memory.h engines/maxwell_3d.cpp engines/maxwell_3d.h engines/maxwell_compute.cpp @@ -12,6 +14,7 @@ add_library(video_core STATIC engines/maxwell_dma.cpp engines/maxwell_dma.h engines/shader_bytecode.h + engines/shader_header.h gpu.cpp gpu.h macro_interpreter.cpp @@ -22,6 +25,8 @@ add_library(video_core STATIC rasterizer_interface.h renderer_base.cpp renderer_base.h + renderer_opengl/gl_buffer_cache.cpp + renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer.h renderer_opengl/gl_rasterizer_cache.cpp diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index dc485e811..f1aa6091b 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -14,6 +14,7 @@ #include "core/tracer/recorder.h" #include "video_core/command_processor.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_compute.h" #include "video_core/engines/maxwell_dma.h" @@ -28,98 +29,109 @@ enum class BufferMethods { CountBufferMethods = 0x40, }; -void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) { - LOG_TRACE(HW_GPU, - "Processing method {:08X} on subchannel {} value " - "{:08X} remaining params {}", - method, subchannel, value, remaining_params); - - if (method == static_cast<u32>(BufferMethods::BindObject)) { - // Bind the current subchannel to the desired engine id. - LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); - bound_engines[subchannel] = static_cast<EngineID>(value); - return; - } +MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); - if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) { - // TODO(Subv): Research and implement these methods. - LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); - return; - } +void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) { + MICROPROFILE_SCOPE(ProcessCommandLists); - ASSERT(bound_engines.find(subchannel) != bound_engines.end()); - - const EngineID engine = bound_engines[subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->WriteReg(method, value); - break; - case EngineID::MAXWELL_B: - maxwell_3d->WriteReg(method, value, remaining_params); - break; - case EngineID::MAXWELL_COMPUTE_B: - maxwell_compute->WriteReg(method, value); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->WriteReg(method, value); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} + auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) { + LOG_TRACE(HW_GPU, + "Processing method {:08X} on subchannel {} value " + "{:08X} remaining params {}", + method, subchannel, value, remaining_params); -void GPU::ProcessCommandList(GPUVAddr address, u32 size) { - const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address); - VAddr current_addr = *head_address; - while (current_addr < *head_address + size * sizeof(CommandHeader)) { - const CommandHeader header = {Memory::Read32(current_addr)}; - current_addr += sizeof(u32); - - switch (header.mode.Value()) { - case SubmissionMode::IncreasingOld: - case SubmissionMode::Increasing: { - // Increase the method value with each argument. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } - break; + ASSERT(subchannel < bound_engines.size()); + + if (method == static_cast<u32>(BufferMethods::BindObject)) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); + bound_engines[subchannel] = static_cast<EngineID>(value); + return; } - case SubmissionMode::NonIncreasingOld: - case SubmissionMode::NonIncreasing: { - // Use the same method value for all arguments. - for (unsigned i = 0; i < header.arg_count; ++i) { - WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); - current_addr += sizeof(u32); - } + + if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) { + // TODO(Subv): Research and implement these methods. + LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); + return; + } + + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->WriteReg(method, value); + break; + case EngineID::MAXWELL_B: + maxwell_3d->WriteReg(method, value, remaining_params); break; + case EngineID::MAXWELL_COMPUTE_B: + maxwell_compute->WriteReg(method, value); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->WriteReg(method, value); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->WriteReg(method, value); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); } - case SubmissionMode::IncreaseOnce: { - ASSERT(header.arg_count.Value() >= 1); + }; - // Use the original method for the first argument and then the next method for all other - // arguments. - WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), - header.arg_count - 1); + for (auto entry : commands) { + Tegra::GPUVAddr address = entry.Address(); + u32 size = entry.sz; + const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address); + VAddr current_addr = *head_address; + while (current_addr < *head_address + size * sizeof(CommandHeader)) { + const CommandHeader header = {Memory::Read32(current_addr)}; current_addr += sizeof(u32); - for (unsigned i = 1; i < header.arg_count; ++i) { - WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), - header.arg_count - i - 1); + switch (header.mode.Value()) { + case SubmissionMode::IncreasingOld: + case SubmissionMode::Increasing: { + // Increase the method value with each argument. + for (unsigned i = 0; i < header.arg_count; ++i) { + WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::NonIncreasingOld: + case SubmissionMode::NonIncreasing: { + // Use the same method value for all arguments. + for (unsigned i = 0; i < header.arg_count; ++i) { + WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::IncreaseOnce: { + ASSERT(header.arg_count.Value() >= 1); + + // Use the original method for the first argument and then the next method for all + // other arguments. + WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), + header.arg_count - 1); current_addr += sizeof(u32); + + for (unsigned i = 1; i < header.arg_count; ++i) { + WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), + header.arg_count - i - 1); + current_addr += sizeof(u32); + } + break; + } + case SubmissionMode::Inline: { + // The register value is stored in the bits 16-28 as an immediate + WriteReg(header.method, header.subchannel, header.inline_data, 0); + break; + } + default: + UNIMPLEMENTED(); } - break; - } - case SubmissionMode::Inline: { - // The register value is stored in the bits 16-28 as an immediate - WriteReg(header.method, header.subchannel, header.inline_data, 0); - break; - } - default: - UNIMPLEMENTED(); } } } diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h index a01153e0b..bd766e77a 100644 --- a/src/video_core/command_processor.h +++ b/src/video_core/command_processor.h @@ -7,6 +7,7 @@ #include <type_traits> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/memory_manager.h" namespace Tegra { @@ -19,6 +20,22 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +struct CommandListHeader { + u32 entry0; // gpu_va_lo + union { + u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) + BitField<0, 8, u32> gpu_va_hi; + BitField<8, 2, u32> unk1; + BitField<10, 21, u32> sz; + BitField<31, 1, u32> unk2; + }; + + GPUVAddr Address() const { + return (static_cast<GPUVAddr>(gpu_va_hi) << 32) | entry0; + } +}; +static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size"); + union CommandHeader { u32 hex; diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index dcf9ef8b9..021b83eaa 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -26,7 +26,7 @@ public: void WriteReg(u32 method, u32 value); struct Regs { - static constexpr size_t NUM_REGS = 0x258; + static constexpr std::size_t NUM_REGS = 0x258; struct Surface { RenderTargetFormat format; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp new file mode 100644 index 000000000..66ae6332d --- /dev/null +++ b/src/video_core/engines/kepler_memory.cpp @@ -0,0 +1,45 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/logging/log.h" +#include "core/memory.h" +#include "video_core/engines/kepler_memory.h" + +namespace Tegra::Engines { + +KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {} +KeplerMemory::~KeplerMemory() = default; + +void KeplerMemory::WriteReg(u32 method, u32 value) { + ASSERT_MSG(method < Regs::NUM_REGS, + "Invalid KeplerMemory register, increase the size of the Regs structure"); + + regs.reg_array[method] = value; + + switch (method) { + case KEPLERMEMORY_REG_INDEX(exec): { + state.write_offset = 0; + break; + } + case KEPLERMEMORY_REG_INDEX(data): { + ProcessData(value); + break; + } + } +} + +void KeplerMemory::ProcessData(u32 data) { + ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); + ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); + + GPUVAddr address = regs.dest.Address(); + VAddr dest_address = + *memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32)); + + Memory::Write32(dest_address, data); + + state.write_offset++; +} + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h new file mode 100644 index 000000000..b0d0078cf --- /dev/null +++ b/src/video_core/engines/kepler_memory.h @@ -0,0 +1,90 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Engines { + +#define KEPLERMEMORY_REG_INDEX(field_name) \ + (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) + +class KeplerMemory final { +public: + KeplerMemory(MemoryManager& memory_manager); + ~KeplerMemory(); + + /// Write the value to the register identified by method. + void WriteReg(u32 method, u32 value); + + struct Regs { + static constexpr size_t NUM_REGS = 0x7F; + + union { + struct { + INSERT_PADDING_WORDS(0x60); + + u32 line_length_in; + u32 line_count; + + struct { + u32 address_high; + u32 address_low; + u32 pitch; + u32 block_dimensions; + u32 width; + u32 height; + u32 depth; + u32 z; + u32 x; + u32 y; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } dest; + + struct { + union { + BitField<0, 1, u32> linear; + }; + } exec; + + u32 data; + + INSERT_PADDING_WORDS(0x11); + }; + std::array<u32, NUM_REGS> reg_array; + }; + } regs{}; + + struct { + u32 write_offset = 0; + } state{}; + +private: + MemoryManager& memory_manager; + + void ProcessData(u32 data); +}; + +#define ASSERT_REG_POSITION(field_name, position) \ + static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + +ASSERT_REG_POSITION(line_length_in, 0x60); +ASSERT_REG_POSITION(line_count, 0x61); +ASSERT_REG_POSITION(dest, 0x62); +ASSERT_REG_POSITION(exec, 0x6C); +ASSERT_REG_POSITION(data, 0x6D); +#undef ASSERT_REG_POSITION + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 68ff1e86b..8afd26fe9 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -5,6 +5,7 @@ #include <cinttypes> #include "common/assert.h" #include "core/core.h" +#include "core/core_timing.h" #include "core/memory.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/engines/maxwell_3d.h" @@ -134,8 +135,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { break; } - rasterizer.NotifyMaxwellRegisterChanged(method); - if (debug_context) { debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr); } @@ -194,8 +193,8 @@ void Maxwell3D::ProcessQueryGet() { // wait queues. LongQueryResult query_result{}; query_result.value = result; - // TODO(Subv): Generate a real GPU timestamp and write it here instead of 0 - query_result.timestamp = 0; + // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming + query_result.timestamp = CoreTiming::GetTicks(); Memory::WriteBlock(*address, &query_result, sizeof(query_result)); } break; @@ -249,8 +248,8 @@ void Maxwell3D::DrawArrays() { void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. - auto& shader = state.shader_stages[static_cast<size_t>(stage)]; - auto& bind_data = regs.cb_bind[static_cast<size_t>(stage)]; + auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; + auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)]; auto& buffer = shader.const_buffers[bind_data.index]; @@ -292,10 +291,6 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { tic_entry.header_version == Texture::TICHeaderVersion::Pitch, "TIC versions other than BlockLinear or Pitch are unimplemented"); - ASSERT_MSG((tic_entry.texture_type == Texture::TextureType::Texture2D) || - (tic_entry.texture_type == Texture::TextureType::Texture2DNoMipmap), - "Texture types other than Texture2D are unimplemented"); - auto r_type = tic_entry.r_type.Value(); auto g_type = tic_entry.g_type.Value(); auto b_type = tic_entry.b_type.Value(); @@ -321,14 +316,14 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderStage stage) const { std::vector<Texture::FullTextureInfo> textures; - auto& fragment_shader = state.shader_stages[static_cast<size_t>(stage)]; + auto& fragment_shader = state.shader_stages[static_cast<std::size_t>(stage)]; auto& tex_info_buffer = fragment_shader.const_buffers[regs.tex_cb_index]; ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); GPUVAddr tex_info_buffer_end = tex_info_buffer.address + tex_info_buffer.size; // Offset into the texture constbuffer where the texture info begins. - static constexpr size_t TextureInfoOffset = 0x20; + static constexpr std::size_t TextureInfoOffset = 0x20; for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset; current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) { @@ -365,8 +360,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt return textures; } -Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, size_t offset) const { - auto& shader = state.shader_stages[static_cast<size_t>(stage)]; +Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, + std::size_t offset) const { + auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index]; ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 92bfda053..b81b0723d 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -34,17 +34,17 @@ public: /// Register structure of the Maxwell3D engine. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered. struct Regs { - static constexpr size_t NUM_REGS = 0xE00; - - static constexpr size_t NumRenderTargets = 8; - static constexpr size_t NumViewports = 16; - static constexpr size_t NumCBData = 16; - static constexpr size_t NumVertexArrays = 32; - static constexpr size_t NumVertexAttributes = 32; - static constexpr size_t MaxShaderProgram = 6; - static constexpr size_t MaxShaderStage = 5; + static constexpr std::size_t NUM_REGS = 0xE00; + + static constexpr std::size_t NumRenderTargets = 8; + static constexpr std::size_t NumViewports = 16; + static constexpr std::size_t NumCBData = 16; + static constexpr std::size_t NumVertexArrays = 32; + static constexpr std::size_t NumVertexAttributes = 32; + static constexpr std::size_t MaxShaderProgram = 6; + static constexpr std::size_t MaxShaderStage = 5; // Maximum number of const buffers per shader stage. - static constexpr size_t MaxConstBuffers = 18; + static constexpr std::size_t MaxConstBuffers = 18; enum class QueryMode : u32 { Write = 0, @@ -127,6 +127,7 @@ public: BitField<21, 6, Size> size; BitField<27, 3, Type> type; BitField<31, 1, u32> bgra; + u32 hex; }; u32 ComponentCount() const { @@ -262,6 +263,10 @@ public: bool IsValid() const { return size != Size::Invalid; } + + bool operator<(const VertexAttribute& other) const { + return hex < other.hex; + } }; enum class PrimitiveTopology : u32 { @@ -438,9 +443,9 @@ public: } }; - bool IsShaderConfigEnabled(size_t index) const { + bool IsShaderConfigEnabled(std::size_t index) const { // The VertexB is always enabled. - if (index == static_cast<size_t>(Regs::ShaderProgram::VertexB)) { + if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) { return true; } return shader_config[index].enable != 0; @@ -528,7 +533,11 @@ public: u32 stencil_back_mask; u32 stencil_back_func_mask; - INSERT_PADDING_WORDS(0x20); + INSERT_PADDING_WORDS(0x13); + + u32 rt_separate_frag_data; + + INSERT_PADDING_WORDS(0xC); struct { u32 address_high; @@ -545,14 +554,29 @@ public: INSERT_PADDING_WORDS(0x5B); - VertexAttribute vertex_attrib_format[NumVertexAttributes]; + std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format; INSERT_PADDING_WORDS(0xF); struct { union { BitField<0, 4, u32> count; + BitField<4, 3, u32> map_0; + BitField<7, 3, u32> map_1; + BitField<10, 3, u32> map_2; + BitField<13, 3, u32> map_3; + BitField<16, 3, u32> map_4; + BitField<19, 3, u32> map_5; + BitField<22, 3, u32> map_6; + BitField<25, 3, u32> map_7; }; + + u32 GetMap(std::size_t index) const { + const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3, + map_4, map_5, map_6, map_7}; + ASSERT(index < maps.size()); + return maps[index]; + } } rt_control; INSERT_PADDING_WORDS(0x2); @@ -901,7 +925,7 @@ public: std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const; /// Returns the texture information for a specific texture in a specific shader stage. - Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, size_t offset) const; + Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const; private: VideoCore::RasterizerInterface& rasterizer; @@ -963,8 +987,9 @@ ASSERT_REG_POSITION(clear_stencil, 0x368); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); +ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(zeta, 0x3F8); -ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458); +ASSERT_REG_POSITION(vertex_attrib_format, 0x458); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); ASSERT_REG_POSITION(zeta_height, 0x48b); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 6e740713f..aa7481b8c 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -41,7 +41,6 @@ void MaxwellDMA::HandleCopy() { // TODO(Subv): Perform more research and implement all features of this engine. ASSERT(regs.exec.enable_swizzle == 0); - ASSERT(regs.exec.enable_2d == 1); ASSERT(regs.exec.query_mode == Regs::QueryMode::None); ASSERT(regs.exec.query_intr == Regs::QueryIntr::None); ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2); @@ -51,10 +50,19 @@ void MaxwellDMA::HandleCopy() { ASSERT(regs.dst_params.pos_y == 0); if (regs.exec.is_dst_linear == regs.exec.is_src_linear) { - Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count * regs.y_count); + std::size_t copy_size = regs.x_count; + + // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D + // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count). + if (regs.exec.enable_2d) { + copy_size = copy_size * regs.y_count; + } + + Memory::CopyBlock(dest_cpu, source_cpu, copy_size); return; } + ASSERT(regs.exec.enable_2d == 1); u8* src_buffer = Memory::GetPointer(source_cpu); u8* dst_buffer = Memory::GetPointer(dest_cpu); diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 7882f16e0..311ccb616 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -23,7 +23,7 @@ public: void WriteReg(u32 method, u32 value); struct Regs { - static constexpr size_t NUM_REGS = 0x1D6; + static constexpr std::size_t NUM_REGS = 0x1D6; struct Parameters { union { diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 3e4efbe0c..7e1de0fa1 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -20,10 +20,10 @@ namespace Tegra::Shader { struct Register { /// Number of registers - static constexpr size_t NumRegisters = 256; + static constexpr std::size_t NumRegisters = 256; /// Register 255 is special cased to always be 0 - static constexpr size_t ZeroIndex = 255; + static constexpr std::size_t ZeroIndex = 255; enum class Size : u64 { Byte = 0, @@ -67,6 +67,13 @@ private: u64 value{}; }; +enum class AttributeSize : u64 { + Word = 0, + DoubleWord = 1, + TripleWord = 2, + QuadWord = 3, +}; + union Attribute { Attribute() = default; @@ -76,6 +83,7 @@ union Attribute { Position = 7, Attribute_0 = 8, Attribute_31 = 39, + PointCoord = 46, // This attribute contains a tuple of (~, ~, InstanceId, VertexId) when inside a vertex // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval // shader. @@ -86,9 +94,10 @@ union Attribute { }; union { + BitField<20, 10, u64> immediate; BitField<22, 2, u64> element; BitField<24, 6, Index> index; - BitField<47, 3, u64> size; + BitField<47, 3, AttributeSize> size; } fmt20; union { @@ -231,6 +240,41 @@ enum class FlowCondition : u64 { Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for? }; +enum class ControlCode : u64 { + F = 0, + LT = 1, + EQ = 2, + LE = 3, + GT = 4, + NE = 5, + GE = 6, + Num = 7, + Nan = 8, + LTU = 9, + EQU = 10, + LEU = 11, + GTU = 12, + NEU = 13, + GEU = 14, + // + OFF = 16, + LO = 17, + SFF = 18, + LS = 19, + HI = 20, + SFT = 21, + HS = 22, + OFT = 23, + CSM_TA = 24, + CSM_TR = 25, + CSM_MX = 26, + FCSM_TA = 27, + FCSM_TR = 28, + FCSM_MX = 29, + RLE = 30, + RGT = 31, +}; + enum class PredicateResultMode : u64 { None = 0x0, NotZero = 0x3, @@ -243,7 +287,47 @@ enum class TextureType : u64 { TextureCube = 3, }; -enum class IpaMode : u64 { Pass = 0, None = 1, Constant = 2, Sc = 3 }; +enum class TextureQueryType : u64 { + Dimension = 1, + TextureType = 2, + SamplePosition = 5, + Filter = 16, + LevelOfDetail = 18, + Wrap = 20, + BorderColor = 22, +}; + +enum class TextureProcessMode : u64 { + None = 0, + LZ = 1, // Unknown, appears to be the same as none. + LB = 2, // Load Bias. + LL = 3, // Load LOD (LevelOfDetail) + LBA = 6, // Load Bias. The A is unknown, does not appear to differ with LB + LLA = 7 // Load LOD. The A is unknown, does not appear to differ with LL +}; + +enum class TextureMiscMode : u64 { + DC, + AOFFI, // Uses Offset + NDV, + NODEP, + MZ, + PTP, +}; + +enum class IpaInterpMode : u64 { Linear = 0, Perspective = 1, Flat = 2, Sc = 3 }; +enum class IpaSampleMode : u64 { Default = 0, Centroid = 1, Offset = 2 }; + +struct IpaMode { + IpaInterpMode interpolation_mode; + IpaSampleMode sampling_mode; + inline bool operator==(const IpaMode& a) { + return (a.interpolation_mode == interpolation_mode) && (a.sampling_mode == sampling_mode); + } + inline bool operator!=(const IpaMode& a) { + return !((*this) == a); + } +}; union Instruction { Instruction& operator=(const Instruction& instr) { @@ -328,10 +412,16 @@ union Instruction { } alu; union { - BitField<54, 3, IpaMode> mode; + BitField<51, 1, u64> saturate; + BitField<52, 2, IpaSampleMode> sample_mode; + BitField<54, 2, IpaInterpMode> interp_mode; } ipa; union { + BitField<39, 2, u64> tab5cb8_2; + BitField<41, 3, u64> tab5c68_1; + BitField<44, 2, u64> tab5c68_0; + BitField<47, 1, u64> cc; BitField<48, 1, u64> negate_b; } fmul; @@ -395,12 +485,54 @@ union Instruction { } bfe; union { + BitField<48, 3, u64> pred48; + + union { + BitField<20, 20, u64> entry_a; + BitField<39, 5, u64> entry_b; + BitField<45, 1, u64> neg; + BitField<46, 1, u64> uses_cc; + } imm; + + union { + BitField<20, 14, u64> cb_index; + BitField<34, 5, u64> cb_offset; + BitField<56, 1, u64> neg; + BitField<57, 1, u64> uses_cc; + } hi; + + union { + BitField<20, 14, u64> cb_index; + BitField<34, 5, u64> cb_offset; + BitField<39, 5, u64> entry_a; + BitField<45, 1, u64> neg; + BitField<46, 1, u64> uses_cc; + } rz; + + union { + BitField<39, 5, u64> entry_a; + BitField<45, 1, u64> neg; + BitField<46, 1, u64> uses_cc; + } r1; + + union { + BitField<28, 8, u64> entry_a; + BitField<37, 1, u64> neg; + BitField<38, 1, u64> uses_cc; + } r2; + + } lea; + + union { BitField<0, 5, FlowCondition> cond; } flow; union { + BitField<47, 1, u64> cc; BitField<48, 1, u64> negate_b; BitField<49, 1, u64> negate_c; + BitField<51, 2, u64> tab5980_1; + BitField<53, 2, u64> tab5980_0; } ffma; union { @@ -446,6 +578,27 @@ union Instruction { } psetp; union { + BitField<12, 3, u64> pred12; + BitField<15, 1, u64> neg_pred12; + BitField<24, 2, PredOperation> cond; + BitField<29, 3, u64> pred29; + BitField<32, 1, u64> neg_pred29; + BitField<39, 3, u64> pred39; + BitField<42, 1, u64> neg_pred39; + BitField<44, 1, u64> bf; + BitField<45, 2, PredOperation> op; + } pset; + + union { + BitField<0, 3, u64> pred0; + BitField<3, 3, u64> pred3; + BitField<8, 5, ControlCode> cc; // flag in cc + BitField<39, 3, u64> pred39; + BitField<42, 1, u64> neg_pred39; + BitField<45, 4, PredOperation> op; // op with pred39 + } csetp; + + union { BitField<39, 3, u64> pred39; BitField<42, 1, u64> neg_pred; BitField<43, 1, u64> neg_a; @@ -490,25 +643,127 @@ union Instruction { BitField<28, 1, u64> array; BitField<29, 2, TextureType> texture_type; BitField<31, 4, u64> component_mask; + BitField<49, 1, u64> nodep_flag; + BitField<50, 1, u64> dc_flag; + BitField<54, 1, u64> aoffi_flag; + BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(size_t component) const { + bool IsComponentEnabled(std::size_t component) const { return ((1ull << component) & component_mask) != 0; } + + TextureProcessMode GetTextureProcessMode() const { + return process_mode; + } + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::DC: + return dc_flag != 0; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + case TextureMiscMode::AOFFI: + return aoffi_flag != 0; + default: + break; + } + return false; + } } tex; union { + BitField<22, 6, TextureQueryType> query_type; + BitField<31, 4, u64> component_mask; + BitField<49, 1, u64> nodep_flag; + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::NODEP: + return nodep_flag != 0; + default: + break; + } + return false; + } + } txq; + + union { BitField<28, 1, u64> array; BitField<29, 2, TextureType> texture_type; + BitField<31, 4, u64> component_mask; + BitField<35, 1, u64> ndv_flag; + BitField<49, 1, u64> nodep_flag; + + bool IsComponentEnabled(std::size_t component) const { + return ((1ull << component) & component_mask) != 0; + } + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::NDV: + return (ndv_flag != 0); + case TextureMiscMode::NODEP: + return (nodep_flag != 0); + default: + break; + } + return false; + } + } tmml; + + union { + BitField<28, 1, u64> array; + BitField<29, 2, TextureType> texture_type; + BitField<35, 1, u64> ndv_flag; + BitField<49, 1, u64> nodep_flag; + BitField<50, 1, u64> dc_flag; + BitField<54, 2, u64> info; BitField<56, 2, u64> component; + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::NDV: + return ndv_flag != 0; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + case TextureMiscMode::DC: + return dc_flag != 0; + case TextureMiscMode::AOFFI: + return info == 1; + case TextureMiscMode::PTP: + return info == 2; + default: + break; + } + return false; + } } tld4; union { + BitField<49, 1, u64> nodep_flag; + BitField<50, 1, u64> dc_flag; + BitField<51, 1, u64> aoffi_flag; BitField<52, 2, u64> component; + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::DC: + return dc_flag != 0; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + case TextureMiscMode::AOFFI: + return aoffi_flag != 0; + default: + break; + } + return false; + } } tld4s; union { BitField<0, 8, Register> gpr0; BitField<28, 8, Register> gpr28; + BitField<49, 1, u64> nodep_flag; BitField<50, 3, u64> component_mask_selector; BitField<53, 4, u64> texture_info; @@ -528,6 +783,37 @@ union Instruction { UNREACHABLE(); } + TextureProcessMode GetTextureProcessMode() const { + switch (texture_info) { + case 0: + case 2: + case 6: + case 8: + case 9: + case 11: + return TextureProcessMode::LZ; + case 3: + case 5: + case 13: + return TextureProcessMode::LL; + default: + break; + } + return TextureProcessMode::None; + } + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::DC: + return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + default: + break; + } + return false; + } + bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info >= 7 && texture_info <= 9; @@ -537,7 +823,7 @@ union Instruction { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(size_t component) const { + bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -545,7 +831,7 @@ union Instruction { {0x7, 0xb, 0xd, 0xe, 0xf}, }}; - size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U}; + std::size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U}; index |= gpr28.Value() != Register::ZeroIndex ? 2 : 0; u32 mask = mask_lut[index][component_mask_selector]; @@ -556,6 +842,7 @@ union Instruction { } texs; union { + BitField<49, 1, u64> nodep_flag; BitField<53, 4, u64> texture_info; TextureType GetTextureType() const { @@ -576,6 +863,26 @@ union Instruction { UNREACHABLE(); } + TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) + return TextureProcessMode::LL; + return TextureProcessMode::LZ; + } + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::AOFFI: + return texture_info == 12 || texture_info == 4; + case TextureMiscMode::MZ: + return texture_info == 5; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + default: + break; + } + return false; + } + bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info == 8; @@ -618,6 +925,7 @@ union Instruction { BitField<36, 5, u64> index; } cbuf36; + BitField<47, 1, u64> generates_cc; BitField<61, 1, u64> is_b_imm; BitField<60, 1, u64> is_b_gpr; BitField<59, 1, u64> is_c_gpr; @@ -647,11 +955,13 @@ public: LDG, // Load from global memory STG, // Store in global memory TEX, - TEXQ, // Texture Query - TEXS, // Texture Fetch with scalar/non-vec4 source/destinations - TLDS, // Texture Load with scalar/non-vec4 source/destinations - TLD4, // Texture Load 4 - TLD4S, // Texture Load 4 with scalar / non - vec4 source / destinations + TXQ, // Texture Query + TEXS, // Texture Fetch with scalar/non-vec4 source/destinations + TLDS, // Texture Load with scalar/non-vec4 source/destinations + TLD4, // Texture Load 4 + TLD4S, // Texture Load 4 with scalar / non - vec4 source / destinations + TMML_B, // Texture Mip Map Level + TMML, // Texture Mip Map Level EXIT, IPA, FFMA_IMM, // Fused Multiply and Add @@ -676,6 +986,11 @@ public: ISCADD_C, // Scale and Add ISCADD_R, ISCADD_IMM, + LEA_R1, + LEA_R2, + LEA_RZ, + LEA_IMM, + LEA_HI, POPC_C, POPC_R, POPC_IMM, @@ -734,6 +1049,8 @@ public: ISET_C, ISET_IMM, PSETP, + PSET, + CSETP, XMAD_IMM, XMAD_CR, XMAD_RC, @@ -757,6 +1074,7 @@ public: IntegerSet, IntegerSetPredicate, PredicateSetPredicate, + PredicateSetRegister, Conversion, Xmad, Unknown, @@ -821,7 +1139,7 @@ public: private: struct Detail { private: - static constexpr size_t opcode_bitsize = 16; + static constexpr std::size_t opcode_bitsize = 16; /** * Generates the mask and the expected value after masking from a given bitstring. @@ -830,8 +1148,8 @@ private: */ static auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; - for (size_t i = 0; i < opcode_bitsize; i++) { - const size_t bit_position = opcode_bitsize - i - 1; + for (std::size_t i = 0; i < opcode_bitsize; i++) { + const std::size_t bit_position = opcode_bitsize - i - 1; switch (bitstring[i]) { case '0': mask |= 1 << bit_position; @@ -871,11 +1189,13 @@ private: INST("1110111011010---", Id::LDG, Type::Memory, "LDG"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), INST("110000----111---", Id::TEX, Type::Memory, "TEX"), - INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"), + INST("1101111101001---", Id::TXQ, Type::Memory, "TXQ"), INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"), INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"), INST("110010----111---", Id::TLD4, Type::Memory, "TLD4"), INST("1101111100------", Id::TLD4S, Type::Memory, "TLD4S"), + INST("110111110110----", Id::TMML_B, Type::Memory, "TMML_B"), + INST("1101111101011---", Id::TMML, Type::Memory, "TMML"), INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"), INST("11100000--------", Id::IPA, Type::Trivial, "IPA"), INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"), @@ -906,6 +1226,11 @@ private: INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"), INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"), INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"), + INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"), + INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"), + INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"), + INST("010010111101----", Id::LEA_RZ, Type::ArithmeticInteger, "LEA_RZ"), + INST("00011000--------", Id::LEA_HI, Type::ArithmeticInteger, "LEA_HI"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), @@ -960,7 +1285,9 @@ private: INST("010110110101----", Id::ISET_R, Type::IntegerSet, "ISET_R"), INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"), INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"), + INST("0101000010001---", Id::PSET, Type::PredicateSetRegister, "PSET"), INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"), + INST("010100001010----", Id::CSETP, Type::PredicateSetPredicate, "CSETP"), INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"), INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"), INST("010100010-------", Id::XMAD_RC, Type::Xmad, "XMAD_RC"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h new file mode 100644 index 000000000..a885ee3cf --- /dev/null +++ b/src/video_core/engines/shader_header.h @@ -0,0 +1,103 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::Shader { + +enum class OutputTopology : u32 { + PointList = 1, + LineStrip = 6, + TriangleStrip = 7, +}; + +// Documentation in: +// http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html#ImapTexture +struct Header { + union { + BitField<0, 5, u32> sph_type; + BitField<5, 5, u32> version; + BitField<10, 4, u32> shader_type; + BitField<14, 1, u32> mrt_enable; + BitField<15, 1, u32> kills_pixels; + BitField<16, 1, u32> does_global_store; + BitField<17, 4, u32> sass_version; + BitField<21, 5, u32> reserved; + BitField<26, 1, u32> does_load_or_store; + BitField<27, 1, u32> does_fp64; + BitField<28, 4, u32> stream_out_mask; + } common0; + + union { + BitField<0, 24, u32> shader_local_memory_low_size; + BitField<24, 8, u32> per_patch_attribute_count; + } common1; + + union { + BitField<0, 24, u32> shader_local_memory_high_size; + BitField<24, 8, u32> threads_per_input_primitive; + } common2; + + union { + BitField<0, 24, u32> shader_local_memory_crs_size; + BitField<24, 4, OutputTopology> output_topology; + BitField<28, 4, u32> reserved; + } common3; + + union { + BitField<0, 12, u32> max_output_vertices; + BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders. + BitField<24, 4, u32> reserved; + BitField<12, 8, u32> store_req_end; // NOTE: not used by geometry shaders. + } common4; + + union { + struct { + INSERT_PADDING_BYTES(3); // ImapSystemValuesA + INSERT_PADDING_BYTES(1); // ImapSystemValuesB + INSERT_PADDING_BYTES(16); // ImapGenericVector[32] + INSERT_PADDING_BYTES(2); // ImapColor + INSERT_PADDING_BYTES(2); // ImapSystemValuesC + INSERT_PADDING_BYTES(5); // ImapFixedFncTexture[10] + INSERT_PADDING_BYTES(1); // ImapReserved + INSERT_PADDING_BYTES(3); // OmapSystemValuesA + INSERT_PADDING_BYTES(1); // OmapSystemValuesB + INSERT_PADDING_BYTES(16); // OmapGenericVector[32] + INSERT_PADDING_BYTES(2); // OmapColor + INSERT_PADDING_BYTES(2); // OmapSystemValuesC + INSERT_PADDING_BYTES(5); // OmapFixedFncTexture[10] + INSERT_PADDING_BYTES(1); // OmapReserved + } vtg; + + struct { + INSERT_PADDING_BYTES(3); // ImapSystemValuesA + INSERT_PADDING_BYTES(1); // ImapSystemValuesB + INSERT_PADDING_BYTES(32); // ImapGenericVector[32] + INSERT_PADDING_BYTES(2); // ImapColor + INSERT_PADDING_BYTES(2); // ImapSystemValuesC + INSERT_PADDING_BYTES(10); // ImapFixedFncTexture[10] + INSERT_PADDING_BYTES(2); // ImapReserved + struct { + u32 target; + union { + BitField<0, 1, u32> sample_mask; + BitField<1, 1, u32> depth; + BitField<2, 30, u32> reserved; + }; + } omap; + bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const { + const u32 bit = render_target * 4 + component; + return omap.target & (1 << bit); + } + } ps; + }; +}; + +static_assert(sizeof(Header) == 0x50, "Incorrect structure size"); + +} // namespace Tegra::Shader diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index e6d8e65c6..baa8b63b7 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -4,6 +4,7 @@ #include "common/assert.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_compute.h" #include "video_core/engines/maxwell_dma.h" @@ -27,6 +28,7 @@ GPU::GPU(VideoCore::RasterizerInterface& rasterizer) { fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager); maxwell_compute = std::make_unique<Engines::MaxwellCompute>(); maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager); + kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager); } GPU::~GPU() = default; @@ -66,6 +68,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { case RenderTargetFormat::RGBA8_UINT: case RenderTargetFormat::RGB10_A2_UNORM: case RenderTargetFormat::BGRA8_UNORM: + case RenderTargetFormat::BGRA8_SRGB: case RenderTargetFormat::RG16_UNORM: case RenderTargetFormat::RG16_SNORM: case RenderTargetFormat::RG16_UINT: diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 2c3dbd97b..5cc1e19ca 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -4,8 +4,9 @@ #pragma once +#include <array> #include <memory> -#include <unordered_map> +#include <vector> #include "common/common_types.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/memory_manager.h" @@ -26,6 +27,7 @@ enum class RenderTargetFormat : u32 { RG32_FLOAT = 0xCB, RG32_UINT = 0xCD, BGRA8_UNORM = 0xCF, + BGRA8_SRGB = 0xD0, RGB10_A2_UNORM = 0xD1, RGBA8_UNORM = 0xD5, RGBA8_SRGB = 0xD6, @@ -40,6 +42,7 @@ enum class RenderTargetFormat : u32 { R32_UINT = 0xE4, R32_FLOAT = 0xE5, B5G6R5_UNORM = 0xE8, + BGR5A1_UNORM = 0xE9, RG8_UNORM = 0xEA, RG8_SNORM = 0xEB, R16_UNORM = 0xEE, @@ -67,6 +70,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format); /// Returns the number of bytes per pixel of each depth format. u32 DepthFormatBytesPerPixel(DepthFormat format); +struct CommandListHeader; class DebugContext; /** @@ -99,6 +103,7 @@ class Fermi2D; class Maxwell3D; class MaxwellCompute; class MaxwellDMA; +class KeplerMemory; } // namespace Engines enum class EngineID { @@ -115,7 +120,7 @@ public: ~GPU(); /// Processes a command list stored at the specified address in GPU memory. - void ProcessCommandList(GPUVAddr address, u32 size); + void ProcessCommandLists(const std::vector<CommandListHeader>& commands); /// Returns a reference to the Maxwell3D GPU engine. Engines::Maxwell3D& Maxwell3D(); @@ -130,13 +135,10 @@ public: const Tegra::MemoryManager& MemoryManager() const; private: - /// Writes a single register in the engine bound to the specified subchannel - void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params); - std::unique_ptr<Tegra::MemoryManager> memory_manager; /// Mapping of command subchannels to their bound engine ids. - std::unordered_map<u32, EngineID> bound_engines; + std::array<EngineID, 8> bound_engines = {}; /// 3D engine std::unique_ptr<Engines::Maxwell3D> maxwell_3d; @@ -146,6 +148,8 @@ private: std::unique_ptr<Engines::MaxwellCompute> maxwell_compute; /// DMA engine std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; + /// Inline memory engine + std::unique_ptr<Engines::KeplerMemory> kepler_memory; }; } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h index 7d836b816..cee0baaf3 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro_interpreter.h @@ -152,7 +152,7 @@ private: boost::optional<u32> delayed_pc; ///< Program counter to execute at after the delay slot is executed. - static constexpr size_t NumMacroRegisters = 8; + static constexpr std::size_t NumMacroRegisters = 8; /// General purpose macro registers. std::array<u32, NumMacroRegisters> registers = {}; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 9d78e8b6b..cd819d69f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -20,9 +20,6 @@ public: /// Clear the current framebuffer virtual void Clear() = 0; - /// Notify rasterizer that the specified Maxwell register has been changed - virtual void NotifyMaxwellRegisterChanged(u32 method) = 0; - /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp index be17a2b9c..0df3725c2 100644 --- a/src/video_core/renderer_base.cpp +++ b/src/video_core/renderer_base.cpp @@ -19,6 +19,7 @@ void RendererBase::RefreshBaseSettings() { UpdateCurrentFramebufferLayout(); renderer_settings.use_framelimiter = Settings::values.use_frame_limit; + renderer_settings.set_background_color = true; } void RendererBase::UpdateCurrentFramebufferLayout() { diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 2a357f9d0..2cd0738ff 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -19,6 +19,7 @@ namespace VideoCore { struct RendererSettings { std::atomic_bool use_framelimiter{false}; + std::atomic_bool set_background_color{false}; }; class RendererBase : NonCopyable { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp new file mode 100644 index 000000000..578aca789 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -0,0 +1,93 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <memory> + +#include "common/alignment.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace OpenGL { + +OGLBufferCache::OGLBufferCache(std::size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {} + +GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, + std::size_t alignment, bool cache) { + auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); + const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; + + // Cache management is a big overhead, so only cache entries with a given size. + // TODO: Figure out which size is the best for given games. + cache &= size >= 2048; + + if (cache) { + auto entry = TryGet(*cpu_addr); + if (entry) { + if (entry->size >= size && entry->alignment == alignment) { + return entry->offset; + } + Unregister(entry); + } + } + + AlignBuffer(alignment); + GLintptr uploaded_offset = buffer_offset; + + Memory::ReadBlock(*cpu_addr, buffer_ptr, size); + + buffer_ptr += size; + buffer_offset += size; + + if (cache) { + auto entry = std::make_shared<CachedBufferEntry>(); + entry->offset = uploaded_offset; + entry->size = size; + entry->alignment = alignment; + entry->addr = *cpu_addr; + Register(entry); + } + + return uploaded_offset; +} + +GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size, + std::size_t alignment) { + AlignBuffer(alignment); + std::memcpy(buffer_ptr, raw_pointer, size); + GLintptr uploaded_offset = buffer_offset; + + buffer_ptr += size; + buffer_offset += size; + return uploaded_offset; +} + +void OGLBufferCache::Map(std::size_t max_size) { + bool invalidate; + std::tie(buffer_ptr, buffer_offset_base, invalidate) = + stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4); + buffer_offset = buffer_offset_base; + + if (invalidate) { + InvalidateAll(); + } +} +void OGLBufferCache::Unmap() { + stream_buffer.Unmap(buffer_offset - buffer_offset_base); +} + +GLuint OGLBufferCache::GetHandle() const { + return stream_buffer.GetHandle(); +} + +void OGLBufferCache::AlignBuffer(std::size_t alignment) { + // Align the offset, not the mapped pointer + GLintptr offset_aligned = + static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment)); + buffer_ptr += offset_aligned - buffer_offset; + buffer_offset = offset_aligned; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h new file mode 100644 index 000000000..6c18461f4 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -0,0 +1,57 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <memory> + +#include "common/common_types.h" +#include "video_core/rasterizer_cache.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_stream_buffer.h" + +namespace OpenGL { + +struct CachedBufferEntry final { + VAddr GetAddr() const { + return addr; + } + + std::size_t GetSizeInBytes() const { + return size; + } + + VAddr addr; + std::size_t size; + GLintptr offset; + std::size_t alignment; +}; + +class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { +public: + explicit OGLBufferCache(std::size_t size); + + GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, + bool cache = true); + + GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4); + + void Map(std::size_t max_size); + void Unmap(); + + GLuint GetHandle() const; + +protected: + void AlignBuffer(std::size_t alignment); + +private: + OGLStreamBuffer stream_buffer; + + u8* buffer_ptr = nullptr; + GLintptr buffer_offset = 0; + GLintptr buffer_offset_base = 0; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7ce969f73..70fb54507 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <array> #include <memory> #include <string> #include <string_view> @@ -33,16 +34,19 @@ using PixelFormat = SurfaceParams::PixelFormat; using SurfaceType = SurfaceParams::SurfaceType; MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_VS, "OpenGL", "Vertex Shader Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_FS, "OpenGL", "Fragment Shader Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255)); +MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info) - : emu_window{window}, screen_info{info}, stream_buffer(GL_ARRAY_BUFFER, STREAM_BUFFER_SIZE) { + : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) { // Create sampler objects - for (size_t i = 0; i < texture_samplers.size(); ++i) { + for (std::size_t i = 0; i < texture_samplers.size(); ++i) { texture_samplers[i].Create(); state.texture_units[i].sampler = texture_samplers[i].sampler.handle; } @@ -55,6 +59,8 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo if (extension == "GL_ARB_direct_state_access") { has_ARB_direct_state_access = true; + } else if (extension == "GL_ARB_multi_bind") { + has_ARB_multi_bind = true; } else if (extension == "GL_ARB_separate_shader_objects") { has_ARB_separate_shader_objects = true; } else if (extension == "GL_ARB_vertex_attrib_binding") { @@ -67,28 +73,13 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0 state.clip_distance[0] = true; - // Generate VAO and UBO - sw_vao.Create(); - uniform_buffer.Create(); - - state.draw.vertex_array = sw_vao.handle; - state.draw.uniform_buffer = uniform_buffer.handle; - state.Apply(); - // Create render framebuffer framebuffer.Create(); - hw_vao.Create(); - - state.draw.vertex_buffer = stream_buffer.GetHandle(); - shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; - state.draw.vertex_array = hw_vao.handle; state.Apply(); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer.GetHandle()); - glEnable(GL_BLEND); glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); @@ -98,14 +89,60 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo RasterizerOpenGL::~RasterizerOpenGL() {} -std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr, - GLintptr buffer_offset) { +void RasterizerOpenGL::SetupVertexArrays() { MICROPROFILE_SCOPE(OpenGL_VAO); const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); const auto& regs = gpu.regs; - state.draw.vertex_array = hw_vao.handle; - state.draw.vertex_buffer = stream_buffer.GetHandle(); + auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format); + auto& VAO = iter->second; + + if (is_cache_miss) { + VAO.Create(); + state.draw.vertex_array = VAO.handle; + state.Apply(); + + // The index buffer binding is stored within the VAO. Stupid OpenGL, but easy to work + // around. + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer_cache.GetHandle()); + + // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. + // Enables the first 16 vertex attributes always, as we don't know which ones are actually + // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 + // for now to avoid OpenGL errors. + // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't + // assume every shader uses them all. + for (unsigned index = 0; index < 16; ++index) { + const auto& attrib = regs.vertex_attrib_format[index]; + + // Ignore invalid attributes. + if (!attrib.IsValid()) + continue; + + const auto& buffer = regs.vertex_array[attrib.buffer]; + LOG_TRACE(HW_GPU, + "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}", + index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(), + attrib.offset.Value(), attrib.IsNormalized()); + + ASSERT(buffer.IsEnabled()); + + glEnableVertexAttribArray(index); + if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt || + attrib.type == + Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) { + glVertexAttribIFormat(index, attrib.ComponentCount(), + MaxwellToGL::VertexType(attrib), attrib.offset); + } else { + glVertexAttribFormat(index, attrib.ComponentCount(), + MaxwellToGL::VertexType(attrib), + attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); + } + glVertexAttribBinding(index, attrib.buffer); + } + } + state.draw.vertex_array = VAO.handle; + state.draw.vertex_buffer = buffer_cache.GetHandle(); state.Apply(); // Upload all guest vertex arrays sequentially to our buffer @@ -117,77 +154,35 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr, Tegra::GPUVAddr start = vertex_array.StartAddress(); const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { - start += vertex_array.stride * (gpu.state.current_instance / vertex_array.divisor); - } - ASSERT(end > start); - u64 size = end - start + 1; - - GLintptr vertex_buffer_offset; - std::tie(array_ptr, buffer_offset, vertex_buffer_offset) = - UploadMemory(array_ptr, buffer_offset, start, size); + const u64 size = end - start + 1; + const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the current offset. - glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset, + glBindVertexBuffer(index, buffer_cache.GetHandle(), vertex_buffer_offset, vertex_array.stride); if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { - // Tell OpenGL that this is an instanced vertex buffer to prevent accessing different - // indexes on each vertex. We do the instance indexing manually by incrementing the - // start address of the vertex buffer. - glVertexBindingDivisor(index, 1); + // Enable vertex buffer instancing with the specified divisor. + glVertexBindingDivisor(index, vertex_array.divisor); } else { // Disable the vertex buffer instancing. glVertexBindingDivisor(index, 0); } } - - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. - // Enables the first 16 vertex attributes always, as we don't know which ones are actually used - // until shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now - // to avoid OpenGL errors. - // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't - // assume every shader uses them all. - for (unsigned index = 0; index < 16; ++index) { - auto& attrib = regs.vertex_attrib_format[index]; - - // Ignore invalid attributes. - if (!attrib.IsValid()) - continue; - - auto& buffer = regs.vertex_array[attrib.buffer]; - LOG_TRACE(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}", - index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(), - attrib.offset.Value(), attrib.IsNormalized()); - - ASSERT(buffer.IsEnabled()); - - glEnableVertexAttribArray(index); - if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt || - attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) { - glVertexAttribIFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), - attrib.offset); - } else { - glVertexAttribFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), - attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); - } - glVertexAttribBinding(index, attrib.buffer); - } - - return {array_ptr, buffer_offset}; } -std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) { - auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); +void RasterizerOpenGL::SetupShaders() { + MICROPROFILE_SCOPE(OpenGL_Shader); + const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); // Next available bindpoints to use when uploading the const buffers and textures to the GLSL // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points. u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage; u32 current_texture_bindpoint = 0; - for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - auto& shader_config = gpu.regs.shader_config[index]; + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { + const auto& shader_config = gpu.regs.shader_config[index]; const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled @@ -195,21 +190,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr continue; } - std::tie(buffer_ptr, buffer_offset) = - AlignBuffer(buffer_ptr, buffer_offset, static_cast<size_t>(uniform_buffer_alignment)); - - const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5 + const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5 GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu.state.shader_stages[stage]); - std::memcpy(buffer_ptr, &ubo, sizeof(ubo)); + const GLintptr offset = buffer_cache.UploadHostMemory( + &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); // Bind the buffer - glBindBufferRange(GL_UNIFORM_BUFFER, stage, stream_buffer.GetHandle(), buffer_offset, - sizeof(ubo)); - - buffer_ptr += sizeof(ubo); - buffer_offset += sizeof(ubo); + glBindBufferRange(GL_UNIFORM_BUFFER, stage, buffer_cache.GetHandle(), offset, sizeof(ubo)); Shader shader{shader_cache.GetStageProgram(program)}; @@ -230,9 +219,8 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr } // Configure the const buffers for this shader stage. - std::tie(buffer_ptr, buffer_offset, current_constbuffer_bindpoint) = - SetupConstBuffers(buffer_ptr, buffer_offset, static_cast<Maxwell::ShaderStage>(stage), - shader, current_constbuffer_bindpoint); + current_constbuffer_bindpoint = SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), + shader, current_constbuffer_bindpoint); // Configure the textures for this shader stage. current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader, @@ -245,15 +233,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr } } - shader_program_manager->UseTrivialGeometryShader(); + state.Apply(); - return {buffer_ptr, buffer_offset}; + shader_program_manager->UseTrivialGeometryShader(); } -size_t RasterizerOpenGL::CalculateVertexArraysSize() const { +std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - size_t size = 0; + std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { if (!regs.vertex_array[index].IsEnabled()) continue; @@ -309,60 +297,80 @@ void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { cached_pages.add({pages_interval, delta}); } -std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, - bool using_depth_fb, - bool preserve_contents) { +void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb, + bool preserve_contents, + boost::optional<std::size_t> single_color_target) { + MICROPROFILE_SCOPE(OpenGL_Framebuffer); const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - if (regs.rt[0].format == Tegra::RenderTargetFormat::NONE) { - LOG_ERROR(HW_GPU, "RenderTargetFormat is not configured"); - using_color_fb = false; + Surface depth_surface; + if (using_depth_fb) { + depth_surface = res_cache.GetDepthBufferSurface(preserve_contents); } - const bool has_stencil = regs.stencil_enable; - const bool write_color_fb = - state.color_mask.red_enabled == GL_TRUE || state.color_mask.green_enabled == GL_TRUE || - state.color_mask.blue_enabled == GL_TRUE || state.color_mask.alpha_enabled == GL_TRUE; + // TODO(bunnei): Figure out how the below register works. According to envytools, this should be + // used to enable multiple render targets. However, it is left unset on all games that I have + // tested. + ASSERT_MSG(regs.rt_separate_frag_data == 0, "Unimplemented"); - const bool write_depth_fb = - (state.depth.test_enabled && state.depth.write_mask == GL_TRUE) || - (has_stencil && (state.stencil.front.write_mask || state.stencil.back.write_mask)); - - Surface color_surface; - Surface depth_surface; - MathUtil::Rectangle<u32> surfaces_rect; - std::tie(color_surface, depth_surface, surfaces_rect) = - res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb, preserve_contents); + // Bind the framebuffer surfaces + state.draw.draw_framebuffer = framebuffer.handle; + state.Apply(); - const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()}; - const MathUtil::Rectangle<u32> draw_rect{ - static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left, - surfaces_rect.left, surfaces_rect.right)), // Left - static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top, - surfaces_rect.bottom, surfaces_rect.top)), // Top - static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.right, - surfaces_rect.left, surfaces_rect.right)), // Right - static_cast<u32>( - std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.bottom, - surfaces_rect.bottom, surfaces_rect.top))}; // Bottom + if (using_color_fb) { + if (single_color_target) { + // Used when just a single color attachment is enabled, e.g. for clearing a color buffer + Surface color_surface = + res_cache.GetColorBufferSurface(*single_color_target, preserve_contents); + glFramebufferTexture2D( + GL_DRAW_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target), GL_TEXTURE_2D, + color_surface != nullptr ? color_surface->Texture().handle : 0, 0); + glDrawBuffer(GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target)); + } else { + // Multiple color attachments are enabled + std::array<GLenum, Maxwell::NumRenderTargets> buffers; + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents); + buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index); + glFramebufferTexture2D( + GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), + GL_TEXTURE_2D, color_surface != nullptr ? color_surface->Texture().handle : 0, + 0); + } + glDrawBuffers(regs.rt_control.count, buffers.data()); + } + } else { + // No color attachments are enabled - zero out all of them + for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), GL_TEXTURE_2D, + 0, 0); + } + glDrawBuffer(GL_NONE); + } - // Bind the framebuffer surfaces - BindFramebufferSurfaces(color_surface, depth_surface, has_stencil); + if (depth_surface) { + if (regs.stencil_enable) { + // Attach both depth and stencil + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, + depth_surface->Texture().handle, 0); + } else { + // Attach depth + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, + depth_surface->Texture().handle, 0); + // Clear stencil attachment + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + } + } else { + // Clear both depth and stencil attachment + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } - SyncViewport(surfaces_rect); + SyncViewport(); - // Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable - // scissor test to prevent drawing outside of the framebuffer region - state.scissor.enabled = true; - state.scissor.x = draw_rect.left; - state.scissor.y = draw_rect.bottom; - state.scissor.width = draw_rect.GetWidth(); - state.scissor.height = draw_rect.GetHeight(); state.Apply(); - - // Only return the surface to be marked as dirty if writing to it is enabled. - return std::make_pair(write_color_fb ? color_surface : nullptr, - write_depth_fb ? depth_surface : nullptr); } void RasterizerOpenGL::Clear() { @@ -370,32 +378,24 @@ void RasterizerOpenGL::Clear() { SCOPE_EXIT({ prev_state.Apply(); }); const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; - bool use_color_fb = false; - bool use_depth_fb = false; + bool use_color{}; + bool use_depth{}; + bool use_stencil{}; OpenGLState clear_state; - clear_state.draw.draw_framebuffer = state.draw.draw_framebuffer; + clear_state.draw.draw_framebuffer = framebuffer.handle; clear_state.color_mask.red_enabled = regs.clear_buffers.R ? GL_TRUE : GL_FALSE; clear_state.color_mask.green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE; clear_state.color_mask.blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE; clear_state.color_mask.alpha_enabled = regs.clear_buffers.A ? GL_TRUE : GL_FALSE; - GLbitfield clear_mask{}; if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { - if (regs.clear_buffers.RT == 0) { - // We only support clearing the first color attachment for now - clear_mask |= GL_COLOR_BUFFER_BIT; - use_color_fb = true; - } else { - // TODO(subv): Add support for the other color attachments - LOG_CRITICAL(HW_GPU, "Clear unimplemented for RT {}", regs.clear_buffers.RT); - } + use_color = true; } if (regs.clear_buffers.Z) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!"); - use_depth_fb = true; - clear_mask |= GL_DEPTH_BUFFER_BIT; + use_depth = true; // Always enable the depth write when clearing the depth buffer. The depth write mask is // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to true. @@ -404,59 +404,33 @@ void RasterizerOpenGL::Clear() { } if (regs.clear_buffers.S) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); - use_depth_fb = true; - clear_mask |= GL_STENCIL_BUFFER_BIT; + use_stencil = true; clear_state.stencil.test_enabled = true; } - if (!use_color_fb && !use_depth_fb) { + if (!use_color && !use_depth && !use_stencil) { // No color surface nor depth/stencil surface are enabled return; } - if (clear_mask == 0) { - // No clear mask is enabled - return; - } - ScopeAcquireGLContext acquire_context{emu_window}; - auto [dirty_color_surface, dirty_depth_surface] = - ConfigureFramebuffers(use_color_fb, use_depth_fb, false); + ConfigureFramebuffers(use_color, use_depth || use_stencil, false, + regs.clear_buffers.RT.Value()); clear_state.Apply(); - glClearColor(regs.clear_color[0], regs.clear_color[1], regs.clear_color[2], - regs.clear_color[3]); - glClearDepth(regs.clear_depth); - glClearStencil(regs.clear_stencil); - - glClear(clear_mask); -} - -std::pair<u8*, GLintptr> RasterizerOpenGL::AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, - size_t alignment) { - // Align the offset, not the mapped pointer - GLintptr offset_aligned = - static_cast<GLintptr>(Common::AlignUp(static_cast<size_t>(buffer_offset), alignment)); - return {buffer_ptr + (offset_aligned - buffer_offset), offset_aligned}; -} - -std::tuple<u8*, GLintptr, GLintptr> RasterizerOpenGL::UploadMemory(u8* buffer_ptr, - GLintptr buffer_offset, - Tegra::GPUVAddr gpu_addr, - size_t size, size_t alignment) { - std::tie(buffer_ptr, buffer_offset) = AlignBuffer(buffer_ptr, buffer_offset, alignment); - GLintptr uploaded_offset = buffer_offset; - - auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)}; - Memory::ReadBlock(*cpu_addr, buffer_ptr, size); - - buffer_ptr += size; - buffer_offset += size; + if (use_color) { + glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); + } - return {buffer_ptr, buffer_offset, uploaded_offset}; + if (use_depth && use_stencil) { + glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); + } else if (use_depth) { + glClearBufferfv(GL_DEPTH, 0, ®s.clear_depth); + } else if (use_stencil) { + glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); + } } void RasterizerOpenGL::DrawArrays() { @@ -464,12 +438,12 @@ void RasterizerOpenGL::DrawArrays() { return; MICROPROFILE_SCOPE(OpenGL_Drawing); - const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; + const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); + const auto& regs = gpu.regs; ScopeAcquireGLContext acquire_context{emu_window}; - auto [dirty_color_surface, dirty_depth_surface] = - ConfigureFramebuffers(true, regs.zeta.Address() != 0 && regs.zeta_enable != 0, true); + ConfigureFramebuffers(); SyncDepthTestState(); SyncStencilTestState(); @@ -482,43 +456,46 @@ void RasterizerOpenGL::DrawArrays() { // Draw the vertex batch const bool is_indexed = accelerate_draw == AccelDraw::Indexed; - const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()}; + const u64 index_buffer_size{static_cast<u64>(regs.index_array.count) * + static_cast<u64>(regs.index_array.FormatSizeInBytes())}; - state.draw.vertex_buffer = stream_buffer.GetHandle(); + state.draw.vertex_buffer = buffer_cache.GetHandle(); state.Apply(); - size_t buffer_size = CalculateVertexArraysSize(); + std::size_t buffer_size = CalculateVertexArraysSize(); if (is_indexed) { - buffer_size = Common::AlignUp<size_t>(buffer_size, 4) + index_buffer_size; + buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + index_buffer_size; } // Uniform space for the 5 shader stages buffer_size = - Common::AlignUp<size_t>(buffer_size, 4) + + Common::AlignUp<std::size_t>(buffer_size, 4) + (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); - u8* buffer_ptr; - GLintptr buffer_offset; - std::tie(buffer_ptr, buffer_offset, std::ignore) = - stream_buffer.Map(static_cast<GLsizeiptr>(buffer_size), 4); - u8* buffer_ptr_base = buffer_ptr; + buffer_cache.Map(buffer_size); - std::tie(buffer_ptr, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset); + SetupVertexArrays(); // If indexed mode, copy the index buffer GLintptr index_buffer_offset = 0; if (is_indexed) { - std::tie(buffer_ptr, buffer_offset, index_buffer_offset) = UploadMemory( - buffer_ptr, buffer_offset, regs.index_array.StartAddress(), index_buffer_size); + MICROPROFILE_SCOPE(OpenGL_Index); + + // Adjust the index buffer offset so it points to the first desired index. + auto index_start = regs.index_array.StartAddress(); + index_start += static_cast<size_t>(regs.index_array.first) * + static_cast<size_t>(regs.index_array.FormatSizeInBytes()); + + index_buffer_offset = buffer_cache.UploadMemory(index_start, index_buffer_size); } - std::tie(buffer_ptr, buffer_offset) = SetupShaders(buffer_ptr, buffer_offset); + SetupShaders(); - stream_buffer.Unmap(buffer_ptr - buffer_ptr_base); + buffer_cache.Unmap(); shader_program_manager->ApplyTo(state); state.Apply(); @@ -527,14 +504,26 @@ void RasterizerOpenGL::DrawArrays() { if (is_indexed) { const GLint base_vertex{static_cast<GLint>(regs.vb_element_base)}; - // Adjust the index buffer offset so it points to the first desired index. - index_buffer_offset += regs.index_array.first * regs.index_array.FormatSizeInBytes(); - - glDrawElementsBaseVertex(primitive_mode, regs.index_array.count, - MaxwellToGL::IndexFormat(regs.index_array.format), - reinterpret_cast<const void*>(index_buffer_offset), base_vertex); + if (gpu.state.current_instance > 0) { + glDrawElementsInstancedBaseVertexBaseInstance( + primitive_mode, regs.index_array.count, + MaxwellToGL::IndexFormat(regs.index_array.format), + reinterpret_cast<const void*>(index_buffer_offset), 1, base_vertex, + gpu.state.current_instance); + } else { + glDrawElementsBaseVertex(primitive_mode, regs.index_array.count, + MaxwellToGL::IndexFormat(regs.index_array.format), + reinterpret_cast<const void*>(index_buffer_offset), + base_vertex); + } } else { - glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count); + if (gpu.state.current_instance > 0) { + glDrawArraysInstancedBaseInstance(primitive_mode, regs.vertex_buffer.first, + regs.vertex_buffer.count, 1, + gpu.state.current_instance); + } else { + glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count); + } } // Disable scissor test @@ -549,24 +538,18 @@ void RasterizerOpenGL::DrawArrays() { state.Apply(); } -void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {} +void RasterizerOpenGL::FlushAll() {} -void RasterizerOpenGL::FlushAll() { - MICROPROFILE_SCOPE(OpenGL_CacheManagement); -} - -void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { - MICROPROFILE_SCOPE(OpenGL_CacheManagement); -} +void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {} void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); res_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); + buffer_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { - MICROPROFILE_SCOPE(OpenGL_CacheManagement); InvalidateRegion(addr, size); } @@ -614,7 +597,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, void RasterizerOpenGL::SamplerInfo::Create() { sampler.Create(); mag_filter = min_filter = Tegra::Texture::TextureFilter::Linear; - wrap_u = wrap_v = Tegra::Texture::WrapMode::Wrap; + wrap_u = wrap_v = wrap_p = Tegra::Texture::WrapMode::Wrap; // default is GL_LINEAR_MIPMAP_LINEAR glSamplerParameteri(sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); @@ -622,7 +605,7 @@ void RasterizerOpenGL::SamplerInfo::Create() { } void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntry& config) { - GLuint s = sampler.handle; + const GLuint s = sampler.handle; if (mag_filter != config.mag_filter) { mag_filter = config.mag_filter; @@ -641,8 +624,13 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr wrap_v = config.wrap_v; glSamplerParameteri(s, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(wrap_v)); } + if (wrap_p != config.wrap_p) { + wrap_p = config.wrap_p; + glSamplerParameteri(s, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(wrap_p)); + } - if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border) { + if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border || + wrap_p == Tegra::Texture::WrapMode::Border) { const GLvec4 new_border_color = {{config.border_color_r, config.border_color_g, config.border_color_b, config.border_color_a}}; if (border_color != new_border_color) { @@ -652,26 +640,35 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr } } -std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_ptr, - GLintptr buffer_offset, - Maxwell::ShaderStage stage, - Shader& shader, - u32 current_bindpoint) { +u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader, + u32 current_bindpoint) { + MICROPROFILE_SCOPE(OpenGL_UBO); const auto& gpu = Core::System::GetInstance().GPU(); const auto& maxwell3d = gpu.Maxwell3D(); - const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)]; + const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)]; const auto& entries = shader->GetShaderEntries().const_buffer_entries; + constexpr u64 max_binds = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers; + std::array<GLuint, max_binds> bind_buffers; + std::array<GLintptr, max_binds> bind_offsets; + std::array<GLsizeiptr, max_binds> bind_sizes; + + ASSERT_MSG(entries.size() <= max_binds, "Exceeded expected number of binding points."); + // Upload only the enabled buffers from the 16 constbuffers of each shader stage for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& used_buffer = entries[bindpoint]; const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()]; if (!buffer.enabled) { + // With disabled buffers set values as zero to unbind them + bind_buffers[bindpoint] = 0; + bind_offsets[bindpoint] = 0; + bind_sizes[bindpoint] = 0; continue; } - size_t size = 0; + std::size_t size = 0; if (used_buffer.IsIndirect()) { // Buffer is accessed indirectly, so upload the entire thing @@ -692,26 +689,28 @@ std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(u8* buffer_pt size = Common::AlignUp(size, sizeof(GLvec4)); ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); - GLintptr const_buffer_offset; - std::tie(buffer_ptr, buffer_offset, const_buffer_offset) = - UploadMemory(buffer_ptr, buffer_offset, buffer.address, size, - static_cast<size_t>(uniform_buffer_alignment)); - - glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint, - stream_buffer.GetHandle(), const_buffer_offset, size); + GLintptr const_buffer_offset = buffer_cache.UploadMemory( + buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); // Now configure the bindpoint of the buffer inside the shader glUniformBlockBinding(shader->GetProgramHandle(), - shader->GetProgramResourceIndex(used_buffer.GetName()), + shader->GetProgramResourceIndex(used_buffer), current_bindpoint + bindpoint); + + // Prepare values for multibind + bind_buffers[bindpoint] = buffer_cache.GetHandle(); + bind_offsets[bindpoint] = const_buffer_offset; + bind_sizes[bindpoint] = size; } - state.Apply(); + glBindBuffersRange(GL_UNIFORM_BUFFER, current_bindpoint, static_cast<GLsizei>(entries.size()), + bind_buffers.data(), bind_offsets.data(), bind_sizes.data()); - return {buffer_ptr, buffer_offset, current_bindpoint + static_cast<u32>(entries.size())}; + return current_bindpoint + static_cast<u32>(entries.size()); } u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, u32 current_unit) { + MICROPROFILE_SCOPE(OpenGL_Texture); const auto& gpu = Core::System::GetInstance().GPU(); const auto& maxwell3d = gpu.Maxwell3D(); const auto& entries = shader->GetShaderEntries().texture_samplers; @@ -721,24 +720,25 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& entry = entries[bindpoint]; - u32 current_bindpoint = current_unit + bindpoint; + const u32 current_bindpoint = current_unit + bindpoint; // Bind the uniform to the sampler. - glProgramUniform1i(shader->GetProgramHandle(), shader->GetUniformLocation(entry.GetName()), + glProgramUniform1i(shader->GetProgramHandle(), shader->GetUniformLocation(entry), current_bindpoint); const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset()); if (!texture.enabled) { - state.texture_units[current_bindpoint].texture_2d = 0; + state.texture_units[current_bindpoint].texture = 0; continue; } texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc); Surface surface = res_cache.GetTextureSurface(texture); if (surface != nullptr) { - state.texture_units[current_bindpoint].texture_2d = surface->Texture().handle; + state.texture_units[current_bindpoint].texture = surface->Texture().handle; + state.texture_units[current_bindpoint].target = surface->Target(); state.texture_units[current_bindpoint].swizzle.r = MaxwellToGL::SwizzleSource(texture.tic.x_source); state.texture_units[current_bindpoint].swizzle.g = @@ -749,47 +749,19 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader, MaxwellToGL::SwizzleSource(texture.tic.w_source); } else { // Can occur when texture addr is null or its memory is unmapped/invalid - state.texture_units[current_bindpoint].texture_2d = 0; + state.texture_units[current_bindpoint].texture = 0; } } - state.Apply(); - return current_unit + static_cast<u32>(entries.size()); } -void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface, - const Surface& depth_surface, bool has_stencil) { - state.draw.draw_framebuffer = framebuffer.handle; - state.Apply(); - - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, - color_surface != nullptr ? color_surface->Texture().handle : 0, 0); - if (depth_surface != nullptr) { - if (has_stencil) { - // attach both depth and stencil - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - depth_surface->Texture().handle, 0); - } else { - // attach depth - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - depth_surface->Texture().handle, 0); - // clear stencil attachment - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - } - } else { - // clear both depth and stencil attachment - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); - } -} - -void RasterizerOpenGL::SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect) { +void RasterizerOpenGL::SyncViewport() { const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()}; - state.viewport.x = static_cast<GLint>(surfaces_rect.left) + viewport_rect.left; - state.viewport.y = static_cast<GLint>(surfaces_rect.bottom) + viewport_rect.bottom; + state.viewport.x = viewport_rect.left; + state.viewport.y = viewport_rect.bottom; state.viewport.width = static_cast<GLsizei>(viewport_rect.GetWidth()); state.viewport.height = static_cast<GLsizei>(viewport_rect.GetHeight()); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 30045ebff..bf9560bdc 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -6,19 +6,23 @@ #include <array> #include <cstddef> +#include <map> #include <memory> #include <tuple> #include <utility> #include <vector> #include <boost/icl/interval_map.hpp> +#include <boost/optional.hpp> #include <boost/range/iterator_range.hpp> #include <glad/glad.h> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" +#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -42,7 +46,6 @@ public: void DrawArrays() override; void Clear() override; - void NotifyMaxwellRegisterChanged(u32 method) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; @@ -70,7 +73,7 @@ public: }; /// Maximum supported size that a constbuffer can have in bytes. - static constexpr size_t MaxConstbufferSize = 0x10000; + static constexpr std::size_t MaxConstbufferSize = 0x10000; static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); @@ -90,17 +93,20 @@ private: Tegra::Texture::TextureFilter min_filter; Tegra::Texture::WrapMode wrap_u; Tegra::Texture::WrapMode wrap_v; + Tegra::Texture::WrapMode wrap_p; GLvec4 border_color; }; - /// Configures the color and depth framebuffer states and returns the dirty <Color, Depth> - /// surfaces if writing was enabled. - std::pair<Surface, Surface> ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb, - bool preserve_contents); - - /// Binds the framebuffer color and depth surface - void BindFramebufferSurfaces(const Surface& color_surface, const Surface& depth_surface, - bool has_stencil); + /** + * Configures the color and depth framebuffer states. + * @param use_color_fb If true, configure color framebuffers. + * @param using_depth_fb If true, configure the depth/stencil framebuffer. + * @param preserve_contents If true, tries to preserve data from a previously used framebuffer. + * @param single_color_target Specifies if a single color buffer target should be used. + */ + void ConfigureFramebuffers(bool use_color_fb = true, bool using_depth_fb = true, + bool preserve_contents = true, + boost::optional<std::size_t> single_color_target = {}); /* * Configures the current constbuffers to use for the draw command. @@ -109,9 +115,8 @@ private: * @param current_bindpoint The offset at which to start counting new buffer bindpoints. * @returns The next available bindpoint for use in the next shader stage. */ - std::tuple<u8*, GLintptr, u32> SetupConstBuffers( - u8* buffer_ptr, GLintptr buffer_offset, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - Shader& shader, u32 current_bindpoint); + u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader, + u32 current_bindpoint); /* * Configures the current textures to use for the draw command. @@ -124,7 +129,7 @@ private: u32 current_unit); /// Syncs the viewport to match the guest state - void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect); + void SyncViewport(); /// Syncs the clip enabled status to match the guest state void SyncClipEnabled(); @@ -154,6 +159,7 @@ private: void SyncLogicOpState(); bool has_ARB_direct_state_access = false; + bool has_ARB_multi_bind = false; bool has_ARB_separate_shader_objects = false; bool has_ARB_vertex_attrib_binding = false; @@ -167,28 +173,23 @@ private: ScreenInfo& screen_info; std::unique_ptr<GLShader::ProgramManager> shader_program_manager; - OGLVertexArray sw_vao; - OGLVertexArray hw_vao; + std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute, + Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>, + OGLVertexArray> + vertex_array_cache; std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers; - static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - OGLStreamBuffer stream_buffer; - OGLBuffer uniform_buffer; + static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; + OGLBufferCache buffer_cache; OGLFramebuffer framebuffer; GLint uniform_buffer_alignment; - size_t CalculateVertexArraysSize() const; - - std::pair<u8*, GLintptr> SetupVertexArrays(u8* array_ptr, GLintptr buffer_offset); - - std::pair<u8*, GLintptr> SetupShaders(u8* buffer_ptr, GLintptr buffer_offset); + std::size_t CalculateVertexArraysSize() const; - std::pair<u8*, GLintptr> AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, size_t alignment); + void SetupVertexArrays(); - std::tuple<u8*, GLintptr, GLintptr> UploadMemory(u8* buffer_ptr, GLintptr buffer_offset, - Tegra::GPUVAddr gpu_addr, size_t size, - size_t alignment = 4); + void SetupShaders(); enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 1965ab7d5..86682d7cb 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -7,6 +7,7 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/logging/log.h" #include "common/microprofile.h" #include "common/scope_exit.h" #include "core/core.h" @@ -52,14 +53,30 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format)); params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format)); params.unaligned_height = config.tic.Height(); + params.target = SurfaceTargetFromTextureType(config.tic.texture_type); + + switch (params.target) { + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture2D: + params.depth = 1; + break; + case SurfaceTarget::Texture3D: + case SurfaceTarget::Texture2DArray: + params.depth = config.tic.Depth(); + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown depth for target={}", static_cast<u32>(params.target)); + UNREACHABLE(); + params.depth = 1; + break; + } + params.size_in_bytes = params.SizeInBytes(); - params.cache_width = Common::AlignUp(params.width, 16); - params.cache_height = Common::AlignUp(params.height, 16); return params; } -/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer( - const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config) { +/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(std::size_t index) { + const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]}; SurfaceParams params{}; params.addr = TryGetCpuAddr(config.Address()); params.is_tiled = true; @@ -70,9 +87,9 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.width = config.width; params.height = config.height; params.unaligned_height = config.height; + params.target = SurfaceTarget::Texture2D; + params.depth = 1; params.size_in_bytes = params.SizeInBytes(); - params.cache_width = Common::AlignUp(params.width, 16); - params.cache_height = Common::AlignUp(params.height, 16); return params; } @@ -86,13 +103,12 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) { params.pixel_format = PixelFormatFromDepthFormat(format); params.component_type = ComponentTypeFromDepthFormat(format); params.type = GetFormatType(params.pixel_format); - params.size_in_bytes = params.SizeInBytes(); params.width = zeta_width; params.height = zeta_height; params.unaligned_height = zeta_height; + params.target = SurfaceTarget::Texture2D; + params.depth = 1; params.size_in_bytes = params.SizeInBytes(); - params.cache_width = Common::AlignUp(params.width, 16); - params.cache_height = Common::AlignUp(params.height, 16); return params; } @@ -100,7 +116,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false}, // ABGR8S {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false}, // ABGR8UI - {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false}, // B5G6R5U + {GL_RGB8, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false}, // B5G6R5U {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm, false}, // A2B10G10R10U {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U @@ -151,6 +167,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false}, // RG8S {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RG32UI {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // R32UI + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8 // Depth formats {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F @@ -166,8 +183,28 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form ComponentType::Float, false}, // Z32FS8 }}; +static GLenum SurfaceTargetToGL(SurfaceParams::SurfaceTarget target) { + switch (target) { + case SurfaceParams::SurfaceTarget::Texture1D: + return GL_TEXTURE_1D; + case SurfaceParams::SurfaceTarget::Texture2D: + return GL_TEXTURE_2D; + case SurfaceParams::SurfaceTarget::Texture3D: + return GL_TEXTURE_3D; + case SurfaceParams::SurfaceTarget::Texture1DArray: + return GL_TEXTURE_1D_ARRAY; + case SurfaceParams::SurfaceTarget::Texture2DArray: + return GL_TEXTURE_2D_ARRAY; + case SurfaceParams::SurfaceTarget::TextureCubemap: + return GL_TEXTURE_CUBE_MAP; + } + LOG_CRITICAL(Render_OpenGL, "Unimplemented texture target={}", static_cast<u32>(target)); + UNREACHABLE(); + return {}; +} + static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { - ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size()); + ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); auto& format = tex_format_tuples[static_cast<unsigned int>(pixel_format)]; ASSERT(component_type == format.component_type); @@ -177,6 +214,7 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType static bool IsPixelFormatASTC(PixelFormat format) { switch (format) { case PixelFormat::ASTC_2D_4X4: + case PixelFormat::ASTC_2D_8X8: return true; default: return false; @@ -187,6 +225,8 @@ static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { switch (format) { case PixelFormat::ASTC_2D_4X4: return {4, 4}; + case PixelFormat::ASTC_2D_8X8: + return {8, 8}; default: LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format)); UNREACHABLE(); @@ -220,7 +260,8 @@ static bool IsFormatBCn(PixelFormat format) { } template <bool morton_to_gl, PixelFormat format> -void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_buffer, VAddr addr) { +void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, std::size_t gl_buffer_size, + VAddr addr) { constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT; constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format); @@ -230,18 +271,18 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_bu const u32 tile_size{IsFormatBCn(format) ? 4U : 1U}; const std::vector<u8> data = Tegra::Texture::UnswizzleTexture( addr, tile_size, bytes_per_pixel, stride, height, block_height); - const size_t size_to_copy{std::min(gl_buffer.size(), data.size())}; - gl_buffer.assign(data.begin(), data.begin() + size_to_copy); + const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())}; + memcpy(gl_buffer, data.data(), size_to_copy); } else { // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should // check the configuration for this and perform more generic un/swizzle LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!"); VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel, - Memory::GetPointer(addr), gl_buffer.data(), morton_to_gl); + Memory::GetPointer(addr), gl_buffer, morton_to_gl); } } -static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), +static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), SurfaceParams::MaxPixelFormat> morton_to_gl_fns = { // clang-format off @@ -290,6 +331,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), MortonCopy<true, PixelFormat::RG8S>, MortonCopy<true, PixelFormat::RG32UI>, MortonCopy<true, PixelFormat::R32UI>, + MortonCopy<true, PixelFormat::ASTC_2D_8X8>, MortonCopy<true, PixelFormat::Z32F>, MortonCopy<true, PixelFormat::Z16>, MortonCopy<true, PixelFormat::Z24S8>, @@ -298,7 +340,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), // clang-format on }; -static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), +static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr), SurfaceParams::MaxPixelFormat> gl_to_morton_fns = { // clang-format off @@ -349,6 +391,7 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), MortonCopy<false, PixelFormat::RG8S>, MortonCopy<false, PixelFormat::RG32UI>, MortonCopy<false, PixelFormat::R32UI>, + nullptr, MortonCopy<false, PixelFormat::Z32F>, MortonCopy<false, PixelFormat::Z16>, MortonCopy<false, PixelFormat::Z24S8>, @@ -357,33 +400,6 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, VAddr), // clang-format on }; -// Allocate an uninitialized texture of appropriate size and format for the surface -static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tuple, u32 width, - u32 height) { - OpenGLState cur_state = OpenGLState::GetCurState(); - - // Keep track of previous texture bindings - GLuint old_tex = cur_state.texture_units[0].texture_2d; - cur_state.texture_units[0].texture_2d = texture; - cur_state.Apply(); - glActiveTexture(GL_TEXTURE0); - - if (!format_tuple.compressed) { - // Only pre-create the texture for non-compressed textures. - glTexImage2D(GL_TEXTURE_2D, 0, format_tuple.internal_format, width, height, 0, - format_tuple.format, format_tuple.type, nullptr); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - - // Restore previous texture bindings - cur_state.texture_units[0].texture_2d = old_tex; - cur_state.Apply(); -} - static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex, const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type, GLuint read_fb_handle, GLuint draw_fb_handle) { @@ -438,12 +454,53 @@ static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rec return true; } -CachedSurface::CachedSurface(const SurfaceParams& params) : params(params) { +CachedSurface::CachedSurface(const SurfaceParams& params) + : params(params), gl_target(SurfaceTargetToGL(params.target)) { texture.Create(); const auto& rect{params.GetRect()}; - AllocateSurfaceTexture(texture.handle, - GetFormatTuple(params.pixel_format, params.component_type), + + // Keep track of previous texture bindings + OpenGLState cur_state = OpenGLState::GetCurState(); + const auto& old_tex = cur_state.texture_units[0]; + SCOPE_EXIT({ + cur_state.texture_units[0] = old_tex; + cur_state.Apply(); + }); + + cur_state.texture_units[0].texture = texture.handle; + cur_state.texture_units[0].target = SurfaceTargetToGL(params.target); + cur_state.Apply(); + glActiveTexture(GL_TEXTURE0); + + const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type); + if (!format_tuple.compressed) { + // Only pre-create the texture for non-compressed textures. + switch (params.target) { + case SurfaceParams::SurfaceTarget::Texture1D: + glTexStorage1D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, + rect.GetWidth()); + break; + case SurfaceParams::SurfaceTarget::Texture2D: + glTexStorage2D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, rect.GetWidth(), rect.GetHeight()); + break; + case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceParams::SurfaceTarget::Texture2DArray: + glTexStorage3D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format, + rect.GetWidth(), rect.GetHeight(), params.depth); + break; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", + static_cast<u32>(params.target)); + UNREACHABLE(); + glTexStorage2D(GL_TEXTURE_2D, 1, format_tuple.internal_format, rect.GetWidth(), + rect.GetHeight()); + } + } + + glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); } static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) { @@ -461,10 +518,10 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) { S8Z24 input_pixel{}; Z24S8 output_pixel{}; - const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)}; - for (size_t y = 0; y < height; ++y) { - for (size_t x = 0; x < width; ++x) { - const size_t offset{bpp * (y * width + x)}; + constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)}; + for (std::size_t y = 0; y < height; ++y) { + for (std::size_t x = 0; x < width; ++x) { + const std::size_t offset{bpp * (y * width + x)}; std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24)); output_pixel.s8.Assign(input_pixel.s8); output_pixel.z24.Assign(input_pixel.z24); @@ -474,10 +531,10 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) { } static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) { - const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)}; - for (size_t y = 0; y < height; ++y) { - for (size_t x = 0; x < width; ++x) { - const size_t offset{bpp * (y * width + x)}; + constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)}; + for (std::size_t y = 0; y < height; ++y) { + for (std::size_t x = 0; x < width; ++x) { + const std::size_t offset{bpp * (y * width + x)}; const u8 temp{data[offset]}; data[offset] = data[offset + 1]; data[offset + 1] = temp; @@ -493,7 +550,8 @@ static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) { static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format, u32 width, u32 height) { switch (pixel_format) { - case PixelFormat::ASTC_2D_4X4: { + case PixelFormat::ASTC_2D_4X4: + case PixelFormat::ASTC_2D_8X8: { // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC. u32 block_width{}; u32 block_height{}; @@ -514,23 +572,6 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma } } -/** - * Helper function to perform software conversion (as needed) when flushing a buffer to Switch - * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with - * typical desktop GPUs. - */ -static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& /*data*/, PixelFormat pixel_format, - u32 /*width*/, u32 /*height*/) { - switch (pixel_format) { - case PixelFormat::ASTC_2D_4X4: - case PixelFormat::S8Z24: - LOG_CRITICAL(Render_OpenGL, "Unimplemented pixel_format={}", - static_cast<u32>(pixel_format)); - UNREACHABLE(); - break; - } -} - MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192)); void CachedSurface::LoadGLBuffer() { ASSERT(params.type != SurfaceType::Fill); @@ -545,13 +586,25 @@ void CachedSurface::LoadGLBuffer() { MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); if (params.is_tiled) { - gl_buffer.resize(copy_size); + // TODO(bunnei): This only unswizzles and copies a 2D texture - we do not yet know how to do + // this for 3D textures, etc. + switch (params.target) { + case SurfaceParams::SurfaceTarget::Texture2D: + // Pass impl. to the fallback code below + break; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented tiled load for target={}", + static_cast<u32>(params.target)); + UNREACHABLE(); + } - morton_to_gl_fns[static_cast<size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, gl_buffer, params.addr); + gl_buffer.resize(static_cast<std::size_t>(params.depth) * copy_size); + morton_to_gl_fns[static_cast<std::size_t>(params.pixel_format)]( + params.width, params.block_height, params.height, gl_buffer.data(), copy_size, + params.addr); } else { - const u8* const texture_src_data_end = texture_src_data + copy_size; - + const u8* const texture_src_data_end{texture_src_data + + (static_cast<std::size_t>(params.depth) * copy_size)}; gl_buffer.assign(texture_src_data, texture_src_data_end); } @@ -560,23 +613,7 @@ void CachedSurface::LoadGLBuffer() { MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); void CachedSurface::FlushGLBuffer() { - u8* const dst_buffer = Memory::GetPointer(params.addr); - - ASSERT(dst_buffer); - ASSERT(gl_buffer.size() == - params.width * params.height * GetGLBytesPerPixel(params.pixel_format)); - - MICROPROFILE_SCOPE(OpenGL_SurfaceFlush); - - ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width, - params.height); - - if (!params.is_tiled) { - std::memcpy(dst_buffer, gl_buffer.data(), params.size_in_bytes); - } else { - gl_to_morton_fns[static_cast<size_t>(params.pixel_format)]( - params.width, params.block_height, params.height, gl_buffer, params.addr); - } + ASSERT_MSG(false, "Unimplemented"); } MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192)); @@ -586,22 +623,30 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle MICROPROFILE_SCOPE(OpenGL_TextureUL); - ASSERT(gl_buffer.size() == - params.width * params.height * GetGLBytesPerPixel(params.pixel_format)); + ASSERT(gl_buffer.size() == static_cast<std::size_t>(params.width) * params.height * + GetGLBytesPerPixel(params.pixel_format) * params.depth); const auto& rect{params.GetRect()}; // Load data from memory to the surface - GLint x0 = static_cast<GLint>(rect.left); - GLint y0 = static_cast<GLint>(rect.bottom); - size_t buffer_offset = (y0 * params.width + x0) * GetGLBytesPerPixel(params.pixel_format); + const GLint x0 = static_cast<GLint>(rect.left); + const GLint y0 = static_cast<GLint>(rect.bottom); + const std::size_t buffer_offset = + static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.width + + static_cast<std::size_t>(x0)) * + GetGLBytesPerPixel(params.pixel_format); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - GLuint target_tex = texture.handle; + const GLuint target_tex = texture.handle; OpenGLState cur_state = OpenGLState::GetCurState(); - GLuint old_tex = cur_state.texture_units[0].texture_2d; - cur_state.texture_units[0].texture_2d = target_tex; + const auto& old_tex = cur_state.texture_units[0]; + SCOPE_EXIT({ + cur_state.texture_units[0] = old_tex; + cur_state.Apply(); + }); + cur_state.texture_units[0].texture = target_tex; + cur_state.texture_units[0].target = SurfaceTargetToGL(params.target); cur_state.Apply(); // Ensure no bad interactions with GL_UNPACK_ALIGNMENT @@ -610,136 +655,102 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle glActiveTexture(GL_TEXTURE0); if (tuple.compressed) { - glCompressedTexImage2D( - GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width), - static_cast<GLsizei>(params.height), 0, static_cast<GLsizei>(params.size_in_bytes), - &gl_buffer[buffer_offset]); + switch (params.target) { + case SurfaceParams::SurfaceTarget::Texture2D: + glCompressedTexImage2D( + SurfaceTargetToGL(params.target), 0, tuple.internal_format, + static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), 0, + static_cast<GLsizei>(params.size_in_bytes), &gl_buffer[buffer_offset]); + break; + case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceParams::SurfaceTarget::Texture2DArray: + glCompressedTexImage3D( + SurfaceTargetToGL(params.target), 0, tuple.internal_format, + static_cast<GLsizei>(params.width), static_cast<GLsizei>(params.height), + static_cast<GLsizei>(params.depth), 0, static_cast<GLsizei>(params.size_in_bytes), + &gl_buffer[buffer_offset]); + break; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", + static_cast<u32>(params.target)); + UNREACHABLE(); + glCompressedTexImage2D( + GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width), + static_cast<GLsizei>(params.height), 0, static_cast<GLsizei>(params.size_in_bytes), + &gl_buffer[buffer_offset]); + } } else { - glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()), - static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, - &gl_buffer[buffer_offset]); - } - - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); - - cur_state.texture_units[0].texture_2d = old_tex; - cur_state.Apply(); -} - -MICROPROFILE_DEFINE(OpenGL_TextureDL, "OpenGL", "Texture Download", MP_RGB(128, 192, 64)); -void CachedSurface::DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) { - if (params.type == SurfaceType::Fill) - return; - - MICROPROFILE_SCOPE(OpenGL_TextureDL); - - gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format)); - - OpenGLState state = OpenGLState::GetCurState(); - OpenGLState prev_state = state; - SCOPE_EXIT({ prev_state.Apply(); }); - - const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - - // Ensure no bad interactions with GL_PACK_ALIGNMENT - ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0); - glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width)); - - const auto& rect{params.GetRect()}; - size_t buffer_offset = - (rect.bottom * params.width + rect.left) * GetGLBytesPerPixel(params.pixel_format); - - state.UnbindTexture(texture.handle); - state.draw.read_framebuffer = read_fb_handle; - state.Apply(); - if (params.type == SurfaceType::ColorTexture) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, - texture.handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); - } else if (params.type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - texture.handle, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - } else { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - texture.handle, 0); + switch (params.target) { + case SurfaceParams::SurfaceTarget::Texture1D: + glTexSubImage1D(SurfaceTargetToGL(params.target), 0, x0, + static_cast<GLsizei>(rect.GetWidth()), tuple.format, tuple.type, + &gl_buffer[buffer_offset]); + break; + case SurfaceParams::SurfaceTarget::Texture2D: + glTexSubImage2D(SurfaceTargetToGL(params.target), 0, x0, y0, + static_cast<GLsizei>(rect.GetWidth()), + static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, + &gl_buffer[buffer_offset]); + break; + case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceParams::SurfaceTarget::Texture2DArray: + glTexSubImage3D(SurfaceTargetToGL(params.target), 0, x0, y0, 0, + static_cast<GLsizei>(rect.GetWidth()), + static_cast<GLsizei>(rect.GetHeight()), params.depth, tuple.format, + tuple.type, &gl_buffer[buffer_offset]); + break; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", + static_cast<u32>(params.target)); + UNREACHABLE(); + glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()), + static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type, + &gl_buffer[buffer_offset]); + } } - glReadPixels(static_cast<GLint>(rect.left), static_cast<GLint>(rect.bottom), - static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()), - tuple.format, tuple.type, &gl_buffer[buffer_offset]); - glPixelStorei(GL_PACK_ROW_LENGTH, 0); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); } RasterizerCacheOpenGL::RasterizerCacheOpenGL() { read_framebuffer.Create(); draw_framebuffer.Create(); + copy_pbo.Create(); } Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextureInfo& config) { return GetSurface(SurfaceParams::CreateForTexture(config)); } -SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(bool using_color_fb, - bool using_depth_fb, - bool preserve_contents) { - const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs; +Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) { + const auto& regs{Core::System::GetInstance().GPU().Maxwell3D().regs}; + if (!regs.zeta.Address() || !regs.zeta_enable) { + return {}; + } - // TODO(bunnei): This is hard corded to use just the first render buffer - LOG_WARNING(Render_OpenGL, "hard-coded for render target 0!"); + SurfaceParams depth_params{SurfaceParams::CreateForDepthBuffer( + regs.zeta_width, regs.zeta_height, regs.zeta.Address(), regs.zeta.format)}; - // get color and depth surfaces - SurfaceParams color_params{}; - SurfaceParams depth_params{}; + return GetSurface(depth_params, preserve_contents); +} - if (using_color_fb) { - color_params = SurfaceParams::CreateForFramebuffer(regs.rt[0]); - } +Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool preserve_contents) { + const auto& regs{Core::System::GetInstance().GPU().Maxwell3D().regs}; - if (using_depth_fb) { - depth_params = SurfaceParams::CreateForDepthBuffer(regs.zeta_width, regs.zeta_height, - regs.zeta.Address(), regs.zeta.format); - } + ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - MathUtil::Rectangle<u32> color_rect{}; - Surface color_surface; - if (using_color_fb) { - color_surface = GetSurface(color_params, preserve_contents); - if (color_surface) { - color_rect = color_surface->GetSurfaceParams().GetRect(); - } + if (index >= regs.rt_control.count) { + return {}; } - MathUtil::Rectangle<u32> depth_rect{}; - Surface depth_surface; - if (using_depth_fb) { - depth_surface = GetSurface(depth_params, preserve_contents); - if (depth_surface) { - depth_rect = depth_surface->GetSurfaceParams().GetRect(); - } + if (regs.rt[index].Address() == 0 || regs.rt[index].format == Tegra::RenderTargetFormat::NONE) { + return {}; } - MathUtil::Rectangle<u32> fb_rect{}; - if (color_surface && depth_surface) { - fb_rect = color_rect; - // Color and Depth surfaces must have the same dimensions and offsets - if (color_rect.bottom != depth_rect.bottom || color_rect.top != depth_rect.top || - color_rect.left != depth_rect.left || color_rect.right != depth_rect.right) { - color_surface = GetSurface(color_params); - depth_surface = GetSurface(depth_params); - fb_rect = color_surface->GetSurfaceParams().GetRect(); - } - } else if (color_surface) { - fb_rect = color_rect; - } else if (depth_surface) { - fb_rect = depth_rect; - } + const SurfaceParams color_params{SurfaceParams::CreateForFramebuffer(index)}; - return std::make_tuple(color_surface, depth_surface, fb_rect); + return GetSurface(color_params, preserve_contents); } void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { @@ -748,7 +759,6 @@ void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { } void RasterizerCacheOpenGL::FlushSurface(const Surface& surface) { - surface->DownloadGLTexture(read_framebuffer.handle, draw_framebuffer.handle); surface->FlushGLBuffer(); } @@ -806,27 +816,26 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface, // Get a new surface with the new parameters, and blit the previous surface to it Surface new_surface{GetUncachedSurface(new_params)}; - // If format is unchanged, we can do a faster blit without reinterpreting pixel data - if (params.pixel_format == new_params.pixel_format) { + if (params.pixel_format == new_params.pixel_format || + !Settings::values.use_accurate_framebuffers) { + // If the format is the same, just do a framebuffer blit. This is significantly faster than + // using PBOs. The is also likely less accurate, as textures will be converted rather than + // reinterpreted. + BlitTextures(surface->Texture().handle, params.GetRect(), new_surface->Texture().handle, - new_surface->GetSurfaceParams().GetRect(), params.type, - read_framebuffer.handle, draw_framebuffer.handle); - return new_surface; - } + params.GetRect(), params.type, read_framebuffer.handle, + draw_framebuffer.handle); + } else { + // When use_accurate_framebuffers setting is enabled, perform a more accurate surface copy, + // where pixels are reinterpreted as a new format (without conversion). This code path uses + // OpenGL PBOs and is quite slow. - // When using accurate framebuffers, always copy old data to new surface, regardless of format - if (Settings::values.use_accurate_framebuffers) { auto source_format = GetFormatTuple(params.pixel_format, params.component_type); auto dest_format = GetFormatTuple(new_params.pixel_format, new_params.component_type); - size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes()); + std::size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes()); - // Use a Pixel Buffer Object to download the previous texture and then upload it to the new - // one using the new format. - OGLBuffer pbo; - pbo.Create(); - - glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo.handle); + glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo.handle); glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW_ARB); if (source_format.compressed) { glGetCompressedTextureImage(surface->Texture().handle, 0, @@ -845,10 +854,10 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface, // of the data in this case. Games like Super Mario Odyssey seem to hit this case // when drawing, it re-uses the memory of a previous texture as a bigger framebuffer // but it doesn't clear it beforehand, the texture is already full of zeros. - LOG_CRITICAL(HW_GPU, "Trying to upload extra texture data from the CPU during " - "reinterpretation but the texture is tiled."); + LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during " + "reinterpretation but the texture is tiled."); } - size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes(); + std::size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes(); std::vector<u8> data(remaining_size); Memory::ReadBlock(new_params.addr + params.SizeInBytes(), data.data(), data.size()); glBufferSubData(GL_PIXEL_PACK_BUFFER, params.SizeInBytes(), remaining_size, @@ -859,21 +868,38 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface, const auto& dest_rect{new_params.GetRect()}; - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo.handle); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo.handle); if (dest_format.compressed) { - glCompressedTexSubImage2D( - GL_TEXTURE_2D, 0, 0, 0, static_cast<GLsizei>(dest_rect.GetWidth()), - static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format, - static_cast<GLsizei>(new_params.SizeInBytes()), nullptr); + LOG_CRITICAL(HW_GPU, "Compressed copy is unimplemented!"); + UNREACHABLE(); } else { - glTextureSubImage2D(new_surface->Texture().handle, 0, 0, 0, - static_cast<GLsizei>(dest_rect.GetWidth()), - static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format, - dest_format.type, nullptr); + switch (new_params.target) { + case SurfaceParams::SurfaceTarget::Texture1D: + glTextureSubImage1D(new_surface->Texture().handle, 0, 0, + static_cast<GLsizei>(dest_rect.GetWidth()), dest_format.format, + dest_format.type, nullptr); + break; + case SurfaceParams::SurfaceTarget::Texture2D: + glTextureSubImage2D(new_surface->Texture().handle, 0, 0, 0, + static_cast<GLsizei>(dest_rect.GetWidth()), + static_cast<GLsizei>(dest_rect.GetHeight()), dest_format.format, + dest_format.type, nullptr); + break; + case SurfaceParams::SurfaceTarget::Texture3D: + case SurfaceParams::SurfaceTarget::Texture2DArray: + glTextureSubImage3D(new_surface->Texture().handle, 0, 0, 0, 0, + static_cast<GLsizei>(dest_rect.GetWidth()), + static_cast<GLsizei>(dest_rect.GetHeight()), + static_cast<GLsizei>(new_params.depth), dest_format.format, + dest_format.type, nullptr); + break; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", + static_cast<u32>(params.target)); + UNREACHABLE(); + } } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - pbo.Release(); } return new_surface; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index aad75f200..d7a4bc37f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -70,19 +70,20 @@ struct SurfaceParams { RG8S = 42, RG32UI = 43, R32UI = 44, + ASTC_2D_8X8 = 45, MaxColorFormat, // Depth formats - Z32F = 45, - Z16 = 46, + Z32F = 46, + Z16 = 47, MaxDepthFormat, // DepthStencil formats - Z24S8 = 47, - S8Z24 = 48, - Z32FS8 = 49, + Z24S8 = 48, + S8Z24 = 49, + Z32FS8 = 50, MaxDepthStencilFormat, @@ -90,7 +91,7 @@ struct SurfaceParams { Invalid = 255, }; - static constexpr size_t MaxPixelFormat = static_cast<size_t>(PixelFormat::Max); + static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); enum class ComponentType { Invalid = 0, @@ -109,6 +110,33 @@ struct SurfaceParams { Invalid = 4, }; + enum class SurfaceTarget { + Texture1D, + Texture2D, + Texture3D, + Texture1DArray, + Texture2DArray, + TextureCubemap, + }; + + static SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type) { + switch (texture_type) { + case Tegra::Texture::TextureType::Texture1D: + return SurfaceTarget::Texture1D; + case Tegra::Texture::TextureType::Texture2D: + case Tegra::Texture::TextureType::Texture2DNoMipmap: + return SurfaceTarget::Texture2D; + case Tegra::Texture::TextureType::Texture1DArray: + return SurfaceTarget::Texture1DArray; + case Tegra::Texture::TextureType::Texture2DArray: + return SurfaceTarget::Texture2DArray; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type)); + UNREACHABLE(); + return SurfaceTarget::Texture2D; + } + } + /** * Gets the compression factor for the specified PixelFormat. This applies to just the * "compressed width" and "compressed height", not the overall compression factor of a @@ -165,6 +193,7 @@ struct SurfaceParams { 1, // RG8S 1, // RG32UI 1, // R32UI + 4, // ASTC_2D_8X8 1, // Z32F 1, // Z16 1, // Z24S8 @@ -172,8 +201,8 @@ struct SurfaceParams { 1, // Z32FS8 }}; - ASSERT(static_cast<size_t>(format) < compression_factor_table.size()); - return compression_factor_table[static_cast<size_t>(format)]; + ASSERT(static_cast<std::size_t>(format) < compression_factor_table.size()); + return compression_factor_table[static_cast<std::size_t>(format)]; } static constexpr u32 GetFormatBpp(PixelFormat format) { @@ -226,6 +255,7 @@ struct SurfaceParams { 16, // RG8S 64, // RG32UI 32, // R32UI + 16, // ASTC_2D_8X8 32, // Z32F 16, // Z16 32, // Z24S8 @@ -233,8 +263,8 @@ struct SurfaceParams { 64, // Z32FS8 }}; - ASSERT(static_cast<size_t>(format) < bpp_table.size()); - return bpp_table[static_cast<size_t>(format)]; + ASSERT(static_cast<std::size_t>(format) < bpp_table.size()); + return bpp_table[static_cast<std::size_t>(format)]; } u32 GetFormatBpp() const { @@ -270,6 +300,7 @@ struct SurfaceParams { return PixelFormat::ABGR8S; case Tegra::RenderTargetFormat::RGBA8_UINT: return PixelFormat::ABGR8UI; + case Tegra::RenderTargetFormat::BGRA8_SRGB: case Tegra::RenderTargetFormat::BGRA8_UNORM: return PixelFormat::BGRA8; case Tegra::RenderTargetFormat::RGB10_A2_UNORM: @@ -288,6 +319,8 @@ struct SurfaceParams { return PixelFormat::R11FG11FB10F; case Tegra::RenderTargetFormat::B5G6R5_UNORM: return PixelFormat::B5G6R5U; + case Tegra::RenderTargetFormat::BGR5A1_UNORM: + return PixelFormat::A1B5G5R5U; case Tegra::RenderTargetFormat::RGBA32_UINT: return PixelFormat::RGBA32UI; case Tegra::RenderTargetFormat::R8_UNORM: @@ -494,6 +527,8 @@ struct SurfaceParams { return PixelFormat::BC6H_SF16; case Tegra::Texture::TextureFormat::ASTC_2D_4X4: return PixelFormat::ASTC_2D_4X4; + case Tegra::Texture::TextureFormat::ASTC_2D_8X8: + return PixelFormat::ASTC_2D_8X8; case Tegra::Texture::TextureFormat::R16_G16: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: @@ -542,11 +577,13 @@ struct SurfaceParams { case Tegra::RenderTargetFormat::RGBA8_UNORM: case Tegra::RenderTargetFormat::RGBA8_SRGB: case Tegra::RenderTargetFormat::BGRA8_UNORM: + case Tegra::RenderTargetFormat::BGRA8_SRGB: case Tegra::RenderTargetFormat::RGB10_A2_UNORM: case Tegra::RenderTargetFormat::R8_UNORM: case Tegra::RenderTargetFormat::RG16_UNORM: case Tegra::RenderTargetFormat::R16_UNORM: case Tegra::RenderTargetFormat::B5G6R5_UNORM: + case Tegra::RenderTargetFormat::BGR5A1_UNORM: case Tegra::RenderTargetFormat::RG8_UNORM: case Tegra::RenderTargetFormat::RGBA16_UNORM: return ComponentType::UNorm; @@ -607,16 +644,18 @@ struct SurfaceParams { } static SurfaceType GetFormatType(PixelFormat pixel_format) { - if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxColorFormat)) { + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxColorFormat)) { return SurfaceType::ColorTexture; } - if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxDepthFormat)) { + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) { return SurfaceType::Depth; } - if (static_cast<size_t>(pixel_format) < - static_cast<size_t>(PixelFormat::MaxDepthStencilFormat)) { + if (static_cast<std::size_t>(pixel_format) < + static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) { return SurfaceType::DepthStencil; } @@ -630,20 +669,19 @@ struct SurfaceParams { MathUtil::Rectangle<u32> GetRect() const; /// Returns the size of this surface in bytes, adjusted for compression - size_t SizeInBytes() const { + std::size_t SizeInBytes() const { const u32 compression_factor{GetCompressionFactor(pixel_format)}; ASSERT(width % compression_factor == 0); ASSERT(height % compression_factor == 0); return (width / compression_factor) * (height / compression_factor) * - GetFormatBpp(pixel_format) / CHAR_BIT; + GetFormatBpp(pixel_format) * depth / CHAR_BIT; } /// Creates SurfaceParams from a texture configuration static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config); /// Creates SurfaceParams from a framebuffer configuration - static SurfaceParams CreateForFramebuffer( - const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config); + static SurfaceParams CreateForFramebuffer(std::size_t index); /// Creates SurfaceParams for a depth buffer configuration static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height, @@ -652,8 +690,8 @@ struct SurfaceParams { /// Checks if surfaces are compatible for caching bool IsCompatibleSurface(const SurfaceParams& other) const { - return std::tie(pixel_format, type, cache_width, cache_height) == - std::tie(other.pixel_format, other.type, other.cache_width, other.cache_height); + return std::tie(pixel_format, type, width, height) == + std::tie(other.pixel_format, other.type, other.width, other.height); } VAddr addr; @@ -664,12 +702,10 @@ struct SurfaceParams { SurfaceType type; u32 width; u32 height; + u32 depth; u32 unaligned_height; - size_t size_in_bytes; - - // Parameters used for caching only - u32 cache_width; - u32 cache_height; + std::size_t size_in_bytes; + SurfaceTarget target; }; }; // namespace OpenGL @@ -685,7 +721,7 @@ struct SurfaceReserveKey : Common::HashableStruct<OpenGL::SurfaceParams> { namespace std { template <> struct hash<SurfaceReserveKey> { - size_t operator()(const SurfaceReserveKey& k) const { + std::size_t operator()(const SurfaceReserveKey& k) const { return k.Hash(); } }; @@ -701,7 +737,7 @@ public: return params.addr; } - size_t GetSizeInBytes() const { + std::size_t GetSizeInBytes() const { return params.size_in_bytes; } @@ -709,6 +745,10 @@ public: return texture; } + GLenum Target() const { + return gl_target; + } + static constexpr unsigned int GetGLBytesPerPixel(SurfaceParams::PixelFormat format) { if (format == SurfaceParams::PixelFormat::Invalid) return 0; @@ -724,14 +764,14 @@ public: void LoadGLBuffer(); void FlushGLBuffer(); - // Upload/Download data in gl_buffer in/to this surface's texture + // Upload data in gl_buffer to this surface's texture void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle); - void DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle); private: OGLTexture texture; std::vector<u8> gl_buffer; SurfaceParams params; + GLenum gl_target; }; class RasterizerCacheOpenGL final : public RasterizerCache<Surface> { @@ -741,9 +781,11 @@ public: /// Get a surface based on the texture configuration Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config); - /// Get the color and depth surfaces based on the framebuffer configuration - SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb, - bool preserve_contents); + /// Get the depth surface based on the framebuffer configuration + Surface GetDepthBufferSurface(bool preserve_contents); + + /// Get the color surface based on the framebuffer configuration and the specified render target + Surface GetColorBufferSurface(std::size_t index, bool preserve_contents); /// Flushes the surface to Switch memory void FlushSurface(const Surface& surface); @@ -774,6 +816,10 @@ private: OGLFramebuffer read_framebuffer; OGLFramebuffer draw_framebuffer; + + /// Use a Pixel Buffer Object to download the previous texture and then upload it to the new one + /// using the new format. + OGLBuffer copy_pbo; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index ac9adfd83..894fe6eae 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -13,8 +13,8 @@ namespace OpenGL { /// Gets the address for the specified shader stage program static VAddr GetShaderAddress(Maxwell::ShaderProgram program) { - auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); - auto& shader_config = gpu.regs.shader_config[static_cast<size_t>(program)]; + const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); + const auto& shader_config = gpu.regs.shader_config[static_cast<std::size_t>(program)]; return *gpu.memory_manager.GpuToCpuAddress(gpu.regs.code_address.CodeAddress() + shader_config.offset); } @@ -28,7 +28,7 @@ static GLShader::ProgramCode GetShaderCode(VAddr addr) { /// Helper function to set shader uniform block bindings for a single shader stage static void SetShaderUniformBlockBinding(GLuint shader, const char* name, - Maxwell::ShaderStage binding, size_t expected_size) { + Maxwell::ShaderStage binding, std::size_t expected_size) { const GLuint ub_index = glGetUniformBlockIndex(shader, name); if (ub_index == GL_INVALID_INDEX) { return; @@ -36,7 +36,7 @@ static void SetShaderUniformBlockBinding(GLuint shader, const char* name, GLint ub_size = 0; glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size); - ASSERT_MSG(static_cast<size_t>(ub_size) == expected_size, + ASSERT_MSG(static_cast<std::size_t>(ub_size) == expected_size, "Uniform block size did not match! Got {}, expected {}", ub_size, expected_size); glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding)); } @@ -85,23 +85,23 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type) SetShaderUniformBlockBindings(program.handle); } -GLuint CachedShader::GetProgramResourceIndex(const std::string& name) { - auto search{resource_cache.find(name)}; +GLuint CachedShader::GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer) { + const auto search{resource_cache.find(buffer.GetHash())}; if (search == resource_cache.end()) { const GLuint index{ - glGetProgramResourceIndex(program.handle, GL_UNIFORM_BLOCK, name.c_str())}; - resource_cache[name] = index; + glGetProgramResourceIndex(program.handle, GL_UNIFORM_BLOCK, buffer.GetName().c_str())}; + resource_cache[buffer.GetHash()] = index; return index; } return search->second; } -GLint CachedShader::GetUniformLocation(const std::string& name) { - auto search{uniform_cache.find(name)}; +GLint CachedShader::GetUniformLocation(const GLShader::SamplerEntry& sampler) { + const auto search{uniform_cache.find(sampler.GetHash())}; if (search == uniform_cache.end()) { - const GLint index{glGetUniformLocation(program.handle, name.c_str())}; - uniform_cache[name] = index; + const GLint index{glGetUniformLocation(program.handle, sampler.GetName().c_str())}; + uniform_cache[sampler.GetHash()] = index; return index; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 759987604..9bafe43a9 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -4,8 +4,8 @@ #pragma once +#include <map> #include <memory> -#include <unordered_map> #include "common/common_types.h" #include "video_core/rasterizer_cache.h" @@ -28,7 +28,7 @@ public: } /// Gets the size of the shader in guest memory, required for cache management - size_t GetSizeInBytes() const { + std::size_t GetSizeInBytes() const { return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64); } @@ -43,10 +43,10 @@ public: } /// Gets the GL program resource location for the specified resource, caching as needed - GLuint GetProgramResourceIndex(const std::string& name); + GLuint GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer); /// Gets the GL uniform location for the specified resource, caching as needed - GLint GetUniformLocation(const std::string& name); + GLint GetUniformLocation(const GLShader::SamplerEntry& sampler); private: VAddr addr; @@ -55,8 +55,8 @@ private: GLShader::ShaderEntries entries; OGLProgram program; - std::unordered_map<std::string, GLuint> resource_cache; - std::unordered_map<std::string, GLint> uniform_cache; + std::map<u32, GLuint> resource_cache; + std::map<u32, GLint> uniform_cache; }; class ShaderCacheOpenGL final : public RasterizerCache<Shader> { diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 391c92d47..b3e95187e 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -12,6 +12,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/shader_bytecode.h" +#include "video_core/engines/shader_header.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" @@ -26,7 +27,7 @@ using Tegra::Shader::Sampler; using Tegra::Shader::SubOp; constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH; -constexpr u32 PROGRAM_HEADER_SIZE = 0x50; +constexpr u32 PROGRAM_HEADER_SIZE = sizeof(Tegra::Shader::Header); class DecompileFail : public std::runtime_error { public: @@ -113,7 +114,7 @@ private: /// Scans a range of code for labels and determines the exit method. ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels) { - auto [iter, inserted] = + const auto [iter, inserted] = exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined); ExitMethod& exit_method = iter->second; if (!inserted) @@ -131,22 +132,22 @@ private: if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) { return exit_method = ExitMethod::AlwaysEnd; } else { - ExitMethod not_met = Scan(offset + 1, end, labels); + const ExitMethod not_met = Scan(offset + 1, end, labels); return exit_method = ParallelExit(ExitMethod::AlwaysEnd, not_met); } } case OpCode::Id::BRA: { - u32 target = offset + instr.bra.GetBranchTarget(); + const u32 target = offset + instr.bra.GetBranchTarget(); labels.insert(target); - ExitMethod no_jmp = Scan(offset + 1, end, labels); - ExitMethod jmp = Scan(target, end, labels); + const ExitMethod no_jmp = Scan(offset + 1, end, labels); + const ExitMethod jmp = Scan(target, end, labels); return exit_method = ParallelExit(no_jmp, jmp); } case OpCode::Id::SSY: { // The SSY instruction uses a similar encoding as the BRA instruction. ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported"); - u32 target = offset + instr.bra.GetBranchTarget(); + const u32 target = offset + instr.bra.GetBranchTarget(); labels.insert(target); // Continue scanning for an exit method. break; @@ -189,7 +190,7 @@ public: private: void AppendIndentation() { - shader_source.append(static_cast<size_t>(scope) * 4, ' '); + shader_source.append(static_cast<std::size_t>(scope) * 4, ' '); } std::string shader_source; @@ -208,7 +209,7 @@ public: UnsignedInteger, }; - GLSLRegister(size_t index, const std::string& suffix) : index{index}, suffix{suffix} {} + GLSLRegister(std::size_t index, const std::string& suffix) : index{index}, suffix{suffix} {} /// Gets the GLSL type string for a register static std::string GetTypeString() { @@ -226,15 +227,23 @@ public: } /// Returns the index of the register - size_t GetIndex() const { + std::size_t GetIndex() const { return index; } private: - const size_t index; + const std::size_t index; const std::string& suffix; }; +enum class InternalFlag : u64 { + ZeroFlag = 0, + CarryFlag = 1, + OverflowFlag = 2, + NaNFlag = 3, + Amount +}; + /** * Used to manage shader registers that are emulated with GLSL. This class keeps track of the state * of all registers (e.g. whether they are currently being used as Floats or Integers), and @@ -247,6 +256,7 @@ public: const Maxwell3D::Regs::ShaderStage& stage, const std::string& suffix) : shader{shader}, declarations{declarations}, stage{stage}, suffix{suffix} { BuildRegisterList(); + BuildInputList(); } /** @@ -327,13 +337,19 @@ public: void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem, const std::string& value, u64 dest_num_components, u64 value_num_components, bool is_saturated = false, - u64 dest_elem = 0, Register::Size size = Register::Size::Word) { + u64 dest_elem = 0, Register::Size size = Register::Size::Word, + bool sets_cc = false) { ASSERT_MSG(!is_saturated, "Unimplemented"); const std::string func{is_signed ? "intBitsToFloat" : "uintBitsToFloat"}; SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')', dest_num_components, value_num_components, dest_elem); + + if (sets_cc) { + const std::string zero_condition = "( " + ConvertIntegerSize(value, size) + " == 0 )"; + SetInternalFlag(InternalFlag::ZeroFlag, zero_condition); + } } /** @@ -343,12 +359,33 @@ public: * @param elem The element to use for the operation. * @param attribute The input attribute to use as the source value. */ - void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute) { - std::string dest = GetRegisterAsFloat(reg); - std::string src = GetInputAttribute(attribute) + GetSwizzle(elem); + void SetRegisterToInputAttibute(const Register& reg, u64 elem, Attribute::Index attribute, + const Tegra::Shader::IpaMode& input_mode) { + const std::string dest = GetRegisterAsFloat(reg); + const std::string src = GetInputAttribute(attribute, input_mode) + GetSwizzle(elem); shader.AddLine(dest + " = " + src + ';'); } + std::string GetControlCode(const Tegra::Shader::ControlCode cc) const { + switch (cc) { + case Tegra::Shader::ControlCode::NEU: + return "!(" + GetInternalFlag(InternalFlag::ZeroFlag) + ')'; + default: + LOG_CRITICAL(HW_GPU, "Unimplemented Control Code {}", static_cast<u32>(cc)); + UNREACHABLE(); + return "false"; + } + } + + std::string GetInternalFlag(const InternalFlag ii) const { + const u32 code = static_cast<u32>(ii); + return "internalFlag_" + std::to_string(code) + suffix; + } + + void SetInternalFlag(const InternalFlag ii, const std::string& value) const { + shader.AddLine(GetInternalFlag(ii) + " = " + value + ';'); + } + /** * Writes code that does a output attribute assignment to register operation. Output attributes * are stored as floats, so this may require conversion. @@ -357,8 +394,8 @@ public: * @param reg The register to use as the source value. */ void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& reg) { - std::string dest = GetOutputAttribute(attribute); - std::string src = GetRegisterAsFloat(reg); + const std::string dest = GetOutputAttribute(attribute); + const std::string src = GetRegisterAsFloat(reg); if (!dest.empty()) { // Can happen with unknown/unimplemented output attributes, in which case we ignore the @@ -391,9 +428,9 @@ public: GLSLRegister::Type type) { declr_const_buffers[cbuf_index].MarkAsUsedIndirect(cbuf_index, stage); - std::string final_offset = fmt::format("({} + {})", index_str, offset / 4); - std::string value = 'c' + std::to_string(cbuf_index) + '[' + final_offset + " / 4][" + - final_offset + " % 4]"; + const std::string final_offset = fmt::format("({} + {})", index_str, offset / 4); + const std::string value = 'c' + std::to_string(cbuf_index) + '[' + final_offset + " / 4][" + + final_offset + " % 4]"; if (type == GLSLRegister::Type::Float) { return value; @@ -412,12 +449,19 @@ public: } declarations.AddNewLine(); - for (const auto& index : declr_input_attribute) { + for (u32 ii = 0; ii < static_cast<u64>(InternalFlag::Amount); ii++) { + const InternalFlag code = static_cast<InternalFlag>(ii); + declarations.AddLine("bool " + GetInternalFlag(code) + " = false;"); + } + declarations.AddNewLine(); + + for (const auto element : declr_input_attribute) { // TODO(bunnei): Use proper number of elements for these - declarations.AddLine("layout(location = " + - std::to_string(static_cast<u32>(index) - - static_cast<u32>(Attribute::Index::Attribute_0)) + - ") in vec4 " + GetInputAttribute(index) + ';'); + u32 idx = + static_cast<u32>(element.first) - static_cast<u32>(Attribute::Index::Attribute_0); + declarations.AddLine("layout(location = " + std::to_string(idx) + ")" + + GetInputFlags(element.first) + "in vec4 " + + GetInputAttribute(element.first, element.second) + ';'); } declarations.AddNewLine(); @@ -440,13 +484,12 @@ public: } declarations.AddNewLine(); - // Append the sampler2D array for the used textures. - size_t num_samplers = GetSamplers().size(); - if (num_samplers > 0) { - declarations.AddLine("uniform sampler2D " + SamplerEntry::GetArrayName(stage) + '[' + - std::to_string(num_samplers) + "];"); - declarations.AddNewLine(); + const auto& samplers = GetSamplers(); + for (const auto& sampler : samplers) { + declarations.AddLine("uniform " + sampler.GetTypeString() + ' ' + sampler.GetName() + + ';'); } + declarations.AddNewLine(); } /// Returns a list of constant buffer declarations @@ -458,27 +501,29 @@ public: } /// Returns a list of samplers used in the shader - std::vector<SamplerEntry> GetSamplers() const { + const std::vector<SamplerEntry>& GetSamplers() const { return used_samplers; } /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if /// necessary. - std::string AccessSampler(const Sampler& sampler) { - size_t offset = static_cast<size_t>(sampler.index.Value()); + std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type, + bool is_array) { + const std::size_t offset = static_cast<std::size_t>(sampler.index.Value()); // If this sampler has already been used, return the existing mapping. - auto itr = + const auto itr = std::find_if(used_samplers.begin(), used_samplers.end(), [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; }); if (itr != used_samplers.end()) { + ASSERT(itr->GetType() == type && itr->IsArray() == is_array); return itr->GetName(); } // Otherwise create a new mapping for this sampler - size_t next_index = used_samplers.size(); - SamplerEntry entry{stage, offset, next_index}; + const std::size_t next_index = used_samplers.size(); + const SamplerEntry entry{stage, offset, next_index, type, is_array}; used_samplers.emplace_back(entry); return entry.GetName(); } @@ -527,16 +572,29 @@ private: void BuildRegisterList() { regs.reserve(Register::NumRegisters); - for (size_t index = 0; index < Register::NumRegisters; ++index) { + for (std::size_t index = 0; index < Register::NumRegisters; ++index) { regs.emplace_back(index, suffix); } } + void BuildInputList() { + const u32 size = static_cast<u32>(Attribute::Index::Attribute_31) - + static_cast<u32>(Attribute::Index::Attribute_0) + 1; + declr_input_attribute.reserve(size); + } + /// Generates code representing an input attribute register. - std::string GetInputAttribute(Attribute::Index attribute) { + std::string GetInputAttribute(Attribute::Index attribute, + const Tegra::Shader::IpaMode& input_mode) { switch (attribute) { case Attribute::Index::Position: - return "position"; + if (stage != Maxwell3D::Regs::ShaderStage::Fragment) { + return "position"; + } else { + return "vec4(gl_FragCoord.x, gl_FragCoord.y, gl_FragCoord.z, 1.0)"; + } + case Attribute::Index::PointCoord: + return "vec4(gl_PointCoord.x, gl_PointCoord.y, 0, 0)"; case Attribute::Index::TessCoordInstanceIDVertexID: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval @@ -552,7 +610,14 @@ private: static_cast<u32>(Attribute::Index::Attribute_0)}; if (attribute >= Attribute::Index::Attribute_0 && attribute <= Attribute::Index::Attribute_31) { - declr_input_attribute.insert(attribute); + if (declr_input_attribute.count(attribute) == 0) { + declr_input_attribute[attribute] = input_mode; + } else { + if (declr_input_attribute[attribute] != input_mode) { + LOG_CRITICAL(HW_GPU, "Same Input multiple input modes"); + UNREACHABLE(); + } + } return "input_attribute_" + std::to_string(index); } @@ -563,6 +628,49 @@ private: return "vec4(0, 0, 0, 0)"; } + std::string GetInputFlags(const Attribute::Index attribute) { + const Tegra::Shader::IpaSampleMode sample_mode = + declr_input_attribute[attribute].sampling_mode; + const Tegra::Shader::IpaInterpMode interp_mode = + declr_input_attribute[attribute].interpolation_mode; + std::string out; + switch (interp_mode) { + case Tegra::Shader::IpaInterpMode::Flat: { + out += "flat "; + break; + } + case Tegra::Shader::IpaInterpMode::Linear: { + out += "noperspective "; + break; + } + case Tegra::Shader::IpaInterpMode::Perspective: { + // Default, Smooth + break; + } + default: { + LOG_CRITICAL(HW_GPU, "Unhandled Ipa InterpMode: {}", static_cast<u32>(interp_mode)); + UNREACHABLE(); + } + } + switch (sample_mode) { + case Tegra::Shader::IpaSampleMode::Centroid: { + // Note not implemented, it can be implemented with the "centroid " keyword in glsl; + LOG_CRITICAL(HW_GPU, "Ipa Sampler Mode: centroid, not implemented"); + UNREACHABLE(); + break; + } + case Tegra::Shader::IpaSampleMode::Default: { + // Default, n/a + break; + } + default: { + LOG_CRITICAL(HW_GPU, "Unhandled Ipa SampleMode: {}", static_cast<u32>(sample_mode)); + UNREACHABLE(); + } + } + return out; + } + /// Generates code representing an output attribute register. std::string GetOutputAttribute(Attribute::Index attribute) { switch (attribute) { @@ -593,7 +701,7 @@ private: ShaderWriter& shader; ShaderWriter& declarations; std::vector<GLSLRegister> regs; - std::set<Attribute::Index> declr_input_attribute; + std::unordered_map<Attribute::Index, Tegra::Shader::IpaMode> declr_input_attribute; std::set<Attribute::Index> declr_output_attribute; std::array<ConstBufferEntry, Maxwell3D::Regs::MaxConstBuffers> declr_const_buffers; std::vector<SamplerEntry> used_samplers; @@ -607,7 +715,7 @@ public: u32 main_offset, Maxwell3D::Regs::ShaderStage stage, const std::string& suffix) : subroutines(subroutines), program_code(program_code), main_offset(main_offset), stage(stage), suffix(suffix) { - + std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); Generate(suffix); } @@ -621,26 +729,9 @@ public: } private: - // Shader program header for a Fragment Shader. - struct FragmentHeader { - INSERT_PADDING_WORDS(5); - INSERT_PADDING_WORDS(13); - u32 enabled_color_outputs; - union { - BitField<0, 1, u32> writes_samplemask; - BitField<1, 1, u32> writes_depth; - }; - - bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const { - u32 bit = render_target * 4 + component; - return enabled_color_outputs & (1 << bit); - } - }; - static_assert(sizeof(FragmentHeader) == PROGRAM_HEADER_SIZE, "FragmentHeader size is wrong"); - /// Gets the Subroutine object corresponding to the specified address. const Subroutine& GetSubroutine(u32 begin, u32 end) const { - auto iter = subroutines.find(Subroutine{begin, end, suffix}); + const auto iter = subroutines.find(Subroutine{begin, end, suffix}); ASSERT(iter != subroutines.end()); return *iter; } @@ -656,8 +747,8 @@ private: } /// Generates code representing a texture sampler. - std::string GetSampler(const Sampler& sampler) { - return regs.AccessSampler(sampler); + std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array) { + return regs.AccessSampler(sampler, type, is_array); } /** @@ -685,7 +776,7 @@ private: // Can't assign to the constant predicate. ASSERT(pred != static_cast<u64>(Pred::UnusedIndex)); - std::string variable = 'p' + std::to_string(pred) + '_' + suffix; + const std::string variable = 'p' + std::to_string(pred) + '_' + suffix; shader.AddLine(variable + " = " + value + ';'); declr_predicates.insert(std::move(variable)); } @@ -795,7 +886,7 @@ private: */ bool IsSchedInstruction(u32 offset) const { // sched instructions appear once every 4 instructions. - static constexpr size_t SchedPeriod = 4; + static constexpr std::size_t SchedPeriod = 4; u32 absolute_offset = offset - main_offset; return (absolute_offset % SchedPeriod) == 0; @@ -863,7 +954,7 @@ private: std::string result; result += '('; - for (size_t i = 0; i < shift_amounts.size(); ++i) { + for (std::size_t i = 0; i < shift_amounts.size(); ++i) { if (i) result += '|'; result += "(((" + imm_lut + " >> (((" + op_c + " >> " + shift_amounts[i] + @@ -887,7 +978,7 @@ private: // TEXS has two destination registers and a swizzle. The first two elements in the swizzle // go into gpr0+0 and gpr0+1, and the rest goes into gpr28+0 and gpr28+1 - size_t written_components = 0; + std::size_t written_components = 0; for (u32 component = 0; component < 4; ++component) { if (!instr.texs.IsComponentEnabled(component)) { continue; @@ -941,10 +1032,8 @@ private: /// Writes the output values from a fragment shader to the corresponding GLSL output variables. void EmitFragmentOutputsWrite() { ASSERT(stage == Maxwell3D::Regs::ShaderStage::Fragment); - FragmentHeader header; - std::memcpy(&header, program_code.data(), PROGRAM_HEADER_SIZE); - ASSERT_MSG(header.writes_samplemask == 0, "Samplemask write is unimplemented"); + ASSERT_MSG(header.ps.omap.sample_mask == 0, "Samplemask write is unimplemented"); // Write the color outputs using the data in the shader registers, disabled // rendertargets/components are skipped in the register assignment. @@ -953,18 +1042,22 @@ private: ++render_target) { // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { - if (header.IsColorComponentOutputEnabled(render_target, component)) { - shader.AddLine(fmt::format("color[{}][{}] = {};", render_target, component, + if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { + shader.AddLine(fmt::format("FragColor{}[{}] = {};", render_target, component, regs.GetRegisterAsFloat(current_reg))); ++current_reg; } } } - if (header.writes_depth) { + if (header.ps.omap.depth) { // The depth output is always 2 registers after the last color output, and current_reg // already contains one past the last color register. - shader.AddLine("gl_FragDepth = " + regs.GetRegisterAsFloat(current_reg + 1) + ';'); + + shader.AddLine( + "gl_FragDepth = " + + regs.GetRegisterAsFloat(static_cast<Tegra::Shader::Register>(current_reg) + 1) + + ';'); } } @@ -1038,6 +1131,15 @@ private: case OpCode::Id::FMUL_R: case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. + ASSERT_MSG(instr.fmul.tab5cb8_2 == 0, "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); + ASSERT_MSG(instr.fmul.tab5c68_1 == 0, "FMUL tab5cb8_1({}) is not implemented", + instr.fmul.tab5c68_1.Value()); + ASSERT_MSG(instr.fmul.tab5c68_0 == 1, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0 + .Value()); // SMO typical sends 1 here which seems to be the default + ASSERT_MSG(instr.fmul.cc == 0, "FMUL cc is not implemented"); + op_b = GetOperandAbsNeg(op_b, false, instr.fmul.negate_b); regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.saturate_d); @@ -1357,7 +1459,7 @@ private: if (instr.alu_integer.negate_b) op_b = "-(" + op_b + ')'; - std::string shift = std::to_string(instr.alu_integer.shift_amount.Value()); + const std::string shift = std::to_string(instr.alu_integer.shift_amount.Value()); regs.SetRegisterToInteger(instr.gpr0, true, 0, "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1); @@ -1375,7 +1477,7 @@ private: case OpCode::Id::SEL_C: case OpCode::Id::SEL_R: case OpCode::Id::SEL_IMM: { - std::string condition = + const std::string condition = GetPredicateCondition(instr.sel.pred, instr.sel.neg_pred != 0); regs.SetRegisterToInteger(instr.gpr0, true, 0, '(' + condition + ") ? " + op_a + " : " + op_b, 1, 1); @@ -1397,8 +1499,9 @@ private: case OpCode::Id::LOP3_C: case OpCode::Id::LOP3_R: case OpCode::Id::LOP3_IMM: { - std::string op_c = regs.GetRegisterAsInteger(instr.gpr39); + const std::string op_c = regs.GetRegisterAsInteger(instr.gpr39); std::string lut; + if (opcode->GetId() == OpCode::Id::LOP3_R) { lut = '(' + std::to_string(instr.alu.lop3.GetImmLut28()) + ')'; } else { @@ -1413,15 +1516,80 @@ private: case OpCode::Id::IMNMX_IMM: { ASSERT_MSG(instr.imnmx.exchange == Tegra::Shader::IMinMaxExchange::None, "Unimplemented"); - std::string condition = + const std::string condition = GetPredicateCondition(instr.imnmx.pred, instr.imnmx.negate_pred != 0); - std::string parameters = op_a + ',' + op_b; + const std::string parameters = op_a + ',' + op_b; regs.SetRegisterToInteger(instr.gpr0, instr.imnmx.is_signed, 0, '(' + condition + ") ? min(" + parameters + ") : max(" + parameters + ')', 1, 1); break; } + case OpCode::Id::LEA_R2: + case OpCode::Id::LEA_R1: + case OpCode::Id::LEA_IMM: + case OpCode::Id::LEA_RZ: + case OpCode::Id::LEA_HI: { + std::string op_c; + + switch (opcode->GetId()) { + case OpCode::Id::LEA_R2: { + op_a = regs.GetRegisterAsInteger(instr.gpr20); + op_b = regs.GetRegisterAsInteger(instr.gpr39); + op_c = std::to_string(instr.lea.r2.entry_a); + break; + } + + case OpCode::Id::LEA_R1: { + const bool neg = instr.lea.r1.neg != 0; + op_a = regs.GetRegisterAsInteger(instr.gpr8); + if (neg) + op_a = "-(" + op_a + ')'; + op_b = regs.GetRegisterAsInteger(instr.gpr20); + op_c = std::to_string(instr.lea.r1.entry_a); + break; + } + + case OpCode::Id::LEA_IMM: { + const bool neg = instr.lea.imm.neg != 0; + op_b = regs.GetRegisterAsInteger(instr.gpr8); + if (neg) + op_b = "-(" + op_b + ')'; + op_a = std::to_string(instr.lea.imm.entry_a); + op_c = std::to_string(instr.lea.imm.entry_b); + break; + } + + case OpCode::Id::LEA_RZ: { + const bool neg = instr.lea.rz.neg != 0; + op_b = regs.GetRegisterAsInteger(instr.gpr8); + if (neg) + op_b = "-(" + op_b + ')'; + op_a = regs.GetUniform(instr.lea.rz.cb_index, instr.lea.rz.cb_offset, + GLSLRegister::Type::Integer); + op_c = std::to_string(instr.lea.rz.entry_a); + + break; + } + + case OpCode::Id::LEA_HI: + default: { + op_b = regs.GetRegisterAsInteger(instr.gpr8); + op_a = std::to_string(instr.lea.imm.entry_a); + op_c = std::to_string(instr.lea.imm.entry_b); + LOG_CRITICAL(HW_GPU, "Unhandled LEA subinstruction: {}", opcode->GetName()); + UNREACHABLE(); + } + } + if (instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex)) { + LOG_ERROR(HW_GPU, "Unhandled LEA Predicate"); + UNREACHABLE(); + } + const std::string value = '(' + op_a + " + (" + op_b + "*(1 << " + op_c + ")))"; + regs.SetRegisterToInteger(instr.gpr0, true, 0, value, 1, 1); + + break; + } default: { LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}", opcode->GetName()); @@ -1432,10 +1600,16 @@ private: break; } case OpCode::Type::Ffma: { - std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); + const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); std::string op_b = instr.ffma.negate_b ? "-" : ""; std::string op_c = instr.ffma.negate_c ? "-" : ""; + ASSERT_MSG(instr.ffma.cc == 0, "FFMA cc not implemented"); + ASSERT_MSG(instr.ffma.tab5980_0 == 1, "FFMA tab5980_0({}) not implemented", + instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO + ASSERT_MSG(instr.ffma.tab5980_1 == 0, "FFMA tab5980_1({}) not implemented", + instr.ffma.tab5980_1.Value()); + switch (opcode->GetId()) { case OpCode::Id::FFMA_CR: { op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset, @@ -1486,7 +1660,8 @@ private: } regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1, - 1, instr.alu.saturate_d, 0, instr.conversion.dest_size); + 1, instr.alu.saturate_d, 0, instr.conversion.dest_size, + instr.generates_cc.Value() != 0); break; } case OpCode::Id::I2F_R: @@ -1616,9 +1791,34 @@ private: case OpCode::Type::Memory: { switch (opcode->GetId()) { case OpCode::Id::LD_A: { - ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested"); - regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element, - instr.attribute.fmt20.index); + // Note: Shouldn't this be interp mode flat? As in no interpolation made. + ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex, + "Indirect attribute loads are not supported"); + ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0, + "Unaligned attribute loads are not supported"); + + Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Perspective, + Tegra::Shader::IpaSampleMode::Default}; + + u64 next_element = instr.attribute.fmt20.element; + u64 next_index = static_cast<u64>(instr.attribute.fmt20.index.Value()); + + const auto LoadNextElement = [&](u32 reg_offset) { + regs.SetRegisterToInputAttibute(instr.gpr0.Value() + reg_offset, next_element, + static_cast<Attribute::Index>(next_index), + input_mode); + + // Load the next attribute element into the following register. If the element + // to load goes beyond the vec4 size, load the first element of the next + // attribute. + next_element = (next_element + 1) % 4; + next_index = next_index + (next_element == 0 ? 1 : 0); + }; + + const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1; + for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) { + LoadNextElement(reg_offset); + } break; } case OpCode::Id::LD_C: { @@ -1632,7 +1832,7 @@ private: shader.AddLine("uint index = (" + regs.GetRegisterAsInteger(instr.gpr8, 0, false) + " / 4) & (MAX_CONSTBUFFER_ELEMENTS - 1);"); - std::string op_a = + const std::string op_a = regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, "index", GLSLRegister::Type::Float); @@ -1642,7 +1842,7 @@ private: break; case Tegra::Shader::UniformType::Double: { - std::string op_b = + const std::string op_b = regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4, "index", GLSLRegister::Type::Float); regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1); @@ -1660,25 +1860,111 @@ private: break; } case OpCode::Id::ST_A: { - ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested"); - regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index, - instr.attribute.fmt20.element, instr.gpr0); + ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex, + "Indirect attribute loads are not supported"); + ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0, + "Unaligned attribute loads are not supported"); + + u64 next_element = instr.attribute.fmt20.element; + u64 next_index = static_cast<u64>(instr.attribute.fmt20.index.Value()); + + const auto StoreNextElement = [&](u32 reg_offset) { + regs.SetOutputAttributeToRegister(static_cast<Attribute::Index>(next_index), + next_element, + instr.gpr0.Value() + reg_offset); + + // Load the next attribute element into the following register. If the element + // to load goes beyond the vec4 size, load the first element of the next + // attribute. + next_element = (next_element + 1) % 4; + next_index = next_index + (next_element == 0 ? 1 : 0); + }; + + const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1; + for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) { + StoreNextElement(reg_offset); + } + break; } case OpCode::Id::TEX: { - const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); - const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); - const std::string sampler = GetSampler(instr.sampler); - const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");"; + ASSERT_MSG(instr.tex.array == 0, "TEX arrays unimplemented"); + Tegra::Shader::TextureType texture_type{instr.tex.texture_type}; + std::string coord; + + ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC), + "DC is not implemented"); + + switch (texture_type) { + case Tegra::Shader::TextureType::Texture1D: { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + coord = "float coords = " + x + ';'; + break; + } + case Tegra::Shader::TextureType::Texture2D: { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + break; + } + default: + LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", + static_cast<u32>(texture_type)); + UNREACHABLE(); + + // Fallback to interpreting as a 2D texture for now + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + texture_type = Tegra::Shader::TextureType::Texture2D; + } + // TODO: make sure coordinates are always indexed to gpr8 and gpr20 is always bias + // or lod. + const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20); + + const std::string sampler = GetSampler(instr.sampler, texture_type, false); // Add an extra scope and declare the texture coords inside to prevent // overwriting them in case they are used as outputs of the texs instruction. + shader.AddLine("{"); ++shader.scope; shader.AddLine(coord); - const std::string texture = "texture(" + sampler + ", coords)"; + std::string texture; - size_t dest_elem{}; - for (size_t elem = 0; elem < 4; ++elem) { + switch (instr.tex.process_mode) { + case Tegra::Shader::TextureProcessMode::None: { + texture = "texture(" + sampler + ", coords)"; + break; + } + case Tegra::Shader::TextureProcessMode::LZ: { + texture = "textureLod(" + sampler + ", coords, 0.0)"; + break; + } + case Tegra::Shader::TextureProcessMode::LB: + case Tegra::Shader::TextureProcessMode::LBA: { + // TODO: Figure if A suffix changes the equation at all. + texture = "texture(" + sampler + ", coords, " + op_c + ')'; + break; + } + case Tegra::Shader::TextureProcessMode::LL: + case Tegra::Shader::TextureProcessMode::LLA: { + // TODO: Figure if A suffix changes the equation at all. + texture = "textureLod(" + sampler + ", coords, " + op_c + ')'; + break; + } + default: { + texture = "texture(" + sampler + ", coords)"; + LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}", + static_cast<u32>(instr.tex.process_mode.Value())); + UNREACHABLE(); + } + } + std::size_t dest_elem{}; + for (std::size_t elem = 0; elem < 4; ++elem) { if (!instr.tex.IsComponentEnabled(elem)) { // Skip disabled components continue; @@ -1691,20 +1977,77 @@ private: break; } case OpCode::Id::TEXS: { - const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); - const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20); - const std::string sampler = GetSampler(instr.sampler); - const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");"; + std::string coord; + Tegra::Shader::TextureType texture_type{instr.texs.GetTextureType()}; + bool is_array{instr.texs.IsArrayTexture()}; + + ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC), + "DC is not implemented"); + + switch (texture_type) { + case Tegra::Shader::TextureType::Texture2D: { + if (is_array) { + const std::string index = regs.GetRegisterAsInteger(instr.gpr8); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string y = regs.GetRegisterAsFloat(instr.gpr20); + coord = "vec3 coords = vec3(" + x + ", " + y + ", " + index + ");"; + } else { + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr20); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + } + break; + } + default: + LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", + static_cast<u32>(texture_type)); + UNREACHABLE(); + // Fallback to interpreting as a 2D texture for now + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr20); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + texture_type = Tegra::Shader::TextureType::Texture2D; + is_array = false; + } + const std::string sampler = GetSampler(instr.sampler, texture_type, is_array); const std::string texture = "texture(" + sampler + ", coords)"; WriteTexsInstruction(instr, coord, texture); break; } case OpCode::Id::TLDS: { - const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8); - const std::string op_b = regs.GetRegisterAsInteger(instr.gpr20); - const std::string sampler = GetSampler(instr.sampler); - const std::string coord = "ivec2 coords = ivec2(" + op_a + ", " + op_b + ");"; + ASSERT(instr.tlds.GetTextureType() == Tegra::Shader::TextureType::Texture2D); + ASSERT(instr.tlds.IsArrayTexture() == false); + std::string coord; + + ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + ASSERT_MSG(!instr.tlds.UsesMiscMode(Tegra::Shader::TextureMiscMode::MZ), + "MZ is not implemented"); + + switch (instr.tlds.GetTextureType()) { + case Tegra::Shader::TextureType::Texture2D: { + if (instr.tlds.IsArrayTexture()) { + LOG_CRITICAL(HW_GPU, "Unhandled 2d array texture"); + UNREACHABLE(); + } else { + const std::string x = regs.GetRegisterAsInteger(instr.gpr8); + const std::string y = regs.GetRegisterAsInteger(instr.gpr20); + coord = "ivec2 coords = ivec2(" + x + ", " + y + ");"; + } + break; + } + default: + LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", + static_cast<u32>(instr.tlds.GetTextureType())); + UNREACHABLE(); + } + const std::string sampler = GetSampler(instr.sampler, instr.tlds.GetTextureType(), + instr.tlds.IsArrayTexture()); const std::string texture = "texelFetch(" + sampler + ", coords, 0)"; WriteTexsInstruction(instr, coord, texture); break; @@ -1712,12 +2055,23 @@ private: case OpCode::Id::TLD4: { ASSERT(instr.tld4.texture_type == Tegra::Shader::TextureType::Texture2D); ASSERT(instr.tld4.array == 0); - std::string coord{}; + std::string coord; + + ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC), + "DC is not implemented"); + ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), + "NDV is not implemented"); + ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::PTP), + "PTP is not implemented"); switch (instr.tld4.texture_type) { case Tegra::Shader::TextureType::Texture2D: { - std::string x = regs.GetRegisterAsFloat(instr.gpr8); - std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const std::string x = regs.GetRegisterAsFloat(instr.gpr8); + const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); coord = "vec2 coords = vec2(" + x + ", " + y + ");"; break; } @@ -1727,7 +2081,8 @@ private: UNREACHABLE(); } - const std::string sampler = GetSampler(instr.sampler); + const std::string sampler = + GetSampler(instr.sampler, instr.tld4.texture_type, false); // Add an extra scope and declare the texture coords inside to prevent // overwriting them in case they are used as outputs of the texs instruction. shader.AddLine("{"); @@ -1736,8 +2091,8 @@ private: const std::string texture = "textureGather(" + sampler + ", coords, " + std::to_string(instr.tld4.component) + ')'; - size_t dest_elem{}; - for (size_t elem = 0; elem < 4; ++elem) { + std::size_t dest_elem{}; + for (std::size_t elem = 0; elem < 4; ++elem) { if (!instr.tex.IsComponentEnabled(elem)) { // Skip disabled components continue; @@ -1750,16 +2105,100 @@ private: break; } case OpCode::Id::TLD4S: { + ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC), + "DC is not implemented"); + const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20); // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction. - const std::string sampler = GetSampler(instr.sampler); + const std::string sampler = + GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false); const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");"; const std::string texture = "textureGather(" + sampler + ", coords, " + std::to_string(instr.tld4s.component) + ')'; WriteTexsInstruction(instr, coord, texture); break; } + case OpCode::Id::TXQ: { + ASSERT_MSG(!instr.txq.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + + // TODO: the new commits on the texture refactor, change the way samplers work. + // Sadly, not all texture instructions specify the type of texture their sampler + // uses. This must be fixed at a later instance. + const std::string sampler = + GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false); + switch (instr.txq.query_type) { + case Tegra::Shader::TextureQueryType::Dimension: { + const std::string texture = "textureQueryLevels(" + sampler + ')'; + regs.SetRegisterToInteger(instr.gpr0, true, 0, texture, 1, 1); + break; + } + default: { + LOG_CRITICAL(HW_GPU, "Unhandled texture query type: {}", + static_cast<u32>(instr.txq.query_type.Value())); + UNREACHABLE(); + } + } + break; + } + case OpCode::Id::TMML: { + ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP), + "NODEP is not implemented"); + ASSERT_MSG(!instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), + "NDV is not implemented"); + + const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8); + const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + const bool is_array = instr.tmml.array != 0; + auto texture_type = instr.tmml.texture_type.Value(); + const std::string sampler = GetSampler(instr.sampler, texture_type, is_array); + + // TODO: add coordinates for different samplers once other texture types are + // implemented. + std::string coord; + switch (texture_type) { + case Tegra::Shader::TextureType::Texture1D: { + std::string x = regs.GetRegisterAsFloat(instr.gpr8); + coord = "float coords = " + x + ';'; + break; + } + case Tegra::Shader::TextureType::Texture2D: { + std::string x = regs.GetRegisterAsFloat(instr.gpr8); + std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + break; + } + default: + LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", + static_cast<u32>(texture_type)); + UNREACHABLE(); + + // Fallback to interpreting as a 2D texture for now + std::string x = regs.GetRegisterAsFloat(instr.gpr8); + std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1); + coord = "vec2 coords = vec2(" + x + ", " + y + ");"; + texture_type = Tegra::Shader::TextureType::Texture2D; + } + // Add an extra scope and declare the texture coords inside to prevent + // overwriting them in case they are used as outputs of the texs instruction. + shader.AddLine('{'); + ++shader.scope; + shader.AddLine(coord); + const std::string texture = "textureQueryLod(" + sampler + ", coords)"; + const std::string tmp = "vec2 tmp = " + texture + "*vec2(256.0, 256.0);"; + shader.AddLine(tmp); + + regs.SetRegisterToInteger(instr.gpr0, true, 0, "int(tmp.y)", 1, 1); + regs.SetRegisterToInteger(instr.gpr0.Value() + 1, false, 0, "uint(tmp.x)", 1, 1); + --shader.scope; + shader.AddLine('}'); + break; + } default: { LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName()); UNREACHABLE(); @@ -1799,12 +2238,12 @@ private: // We can't use the constant predicate as destination. ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); - std::string second_pred = + const std::string second_pred = GetPredicateCondition(instr.fsetp.pred39, instr.fsetp.neg_pred != 0); - std::string combiner = GetPredicateCombiner(instr.fsetp.op); + const std::string combiner = GetPredicateCombiner(instr.fsetp.op); - std::string predicate = GetPredicateComparison(instr.fsetp.cond, op_a, op_b); + const std::string predicate = GetPredicateComparison(instr.fsetp.cond, op_a, op_b); // Set the primary predicate to the result of Predicate OP SecondPredicate SetPredicate(instr.fsetp.pred3, '(' + predicate + ") " + combiner + " (" + second_pred + ')'); @@ -1818,7 +2257,8 @@ private: break; } case OpCode::Type::IntegerSetPredicate: { - std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed); + const std::string op_a = + regs.GetRegisterAsInteger(instr.gpr8, 0, instr.isetp.is_signed); std::string op_b; if (instr.is_b_imm) { @@ -1835,12 +2275,12 @@ private: // We can't use the constant predicate as destination. ASSERT(instr.isetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); - std::string second_pred = + const std::string second_pred = GetPredicateCondition(instr.isetp.pred39, instr.isetp.neg_pred != 0); - std::string combiner = GetPredicateCombiner(instr.isetp.op); + const std::string combiner = GetPredicateCombiner(instr.isetp.op); - std::string predicate = GetPredicateComparison(instr.isetp.cond, op_a, op_b); + const std::string predicate = GetPredicateComparison(instr.isetp.cond, op_a, op_b); // Set the primary predicate to the result of Predicate OP SecondPredicate SetPredicate(instr.isetp.pred3, '(' + predicate + ") " + combiner + " (" + second_pred + ')'); @@ -1853,32 +2293,80 @@ private: } break; } + case OpCode::Type::PredicateSetRegister: { + const std::string op_a = + GetPredicateCondition(instr.pset.pred12, instr.pset.neg_pred12 != 0); + const std::string op_b = + GetPredicateCondition(instr.pset.pred29, instr.pset.neg_pred29 != 0); + + const std::string second_pred = + GetPredicateCondition(instr.pset.pred39, instr.pset.neg_pred39 != 0); + + const std::string combiner = GetPredicateCombiner(instr.pset.op); + + const std::string predicate = + '(' + op_a + ") " + GetPredicateCombiner(instr.pset.cond) + " (" + op_b + ')'; + const std::string result = '(' + predicate + ") " + combiner + " (" + second_pred + ')'; + if (instr.pset.bf == 0) { + const std::string value = '(' + result + ") ? 0xFFFFFFFF : 0"; + regs.SetRegisterToInteger(instr.gpr0, false, 0, value, 1, 1); + } else { + const std::string value = '(' + result + ") ? 1.0 : 0.0"; + regs.SetRegisterToFloat(instr.gpr0, 0, value, 1, 1); + } + + break; + } case OpCode::Type::PredicateSetPredicate: { - std::string op_a = - GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0); - std::string op_b = - GetPredicateCondition(instr.psetp.pred29, instr.psetp.neg_pred29 != 0); + switch (opcode->GetId()) { + case OpCode::Id::PSETP: { + const std::string op_a = + GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0); + const std::string op_b = + GetPredicateCondition(instr.psetp.pred29, instr.psetp.neg_pred29 != 0); - // We can't use the constant predicate as destination. - ASSERT(instr.psetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); + // We can't use the constant predicate as destination. + ASSERT(instr.psetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); - std::string second_pred = - GetPredicateCondition(instr.psetp.pred39, instr.psetp.neg_pred39 != 0); + const std::string second_pred = + GetPredicateCondition(instr.psetp.pred39, instr.psetp.neg_pred39 != 0); - std::string combiner = GetPredicateCombiner(instr.psetp.op); + const std::string combiner = GetPredicateCombiner(instr.psetp.op); - std::string predicate = - '(' + op_a + ") " + GetPredicateCombiner(instr.psetp.cond) + " (" + op_b + ')'; + const std::string predicate = + '(' + op_a + ") " + GetPredicateCombiner(instr.psetp.cond) + " (" + op_b + ')'; - // Set the primary predicate to the result of Predicate OP SecondPredicate - SetPredicate(instr.psetp.pred3, - '(' + predicate + ") " + combiner + " (" + second_pred + ')'); + // Set the primary predicate to the result of Predicate OP SecondPredicate + SetPredicate(instr.psetp.pred3, + '(' + predicate + ") " + combiner + " (" + second_pred + ')'); - if (instr.psetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) { - // Set the secondary predicate to the result of !Predicate OP SecondPredicate, - // if enabled - SetPredicate(instr.psetp.pred0, - "!(" + predicate + ") " + combiner + " (" + second_pred + ')'); + if (instr.psetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) { + // Set the secondary predicate to the result of !Predicate OP SecondPredicate, + // if enabled + SetPredicate(instr.psetp.pred0, + "!(" + predicate + ") " + combiner + " (" + second_pred + ')'); + } + break; + } + case OpCode::Id::CSETP: { + const std::string pred = + GetPredicateCondition(instr.csetp.pred39, instr.csetp.neg_pred39 != 0); + const std::string combiner = GetPredicateCombiner(instr.csetp.op); + const std::string controlCode = regs.GetControlCode(instr.csetp.cc); + if (instr.csetp.pred3 != static_cast<u64>(Pred::UnusedIndex)) { + SetPredicate(instr.csetp.pred3, + '(' + controlCode + ") " + combiner + " (" + pred + ')'); + } + if (instr.csetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) { + SetPredicate(instr.csetp.pred0, + "!(" + controlCode + ") " + combiner + " (" + pred + ')'); + } + break; + } + default: { + LOG_CRITICAL(HW_GPU, "Unhandled predicate instruction: {}", opcode->GetName()); + UNREACHABLE(); + } } break; } @@ -1893,7 +2381,7 @@ private: std::string op_b = instr.fset.neg_b ? "-" : ""; if (instr.is_b_imm) { - std::string imm = GetImmediate19(instr); + const std::string imm = GetImmediate19(instr); if (instr.fset.neg_imm) op_b += "(-" + imm + ')'; else @@ -1913,13 +2401,14 @@ private: // The fset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the // condition is true, and to 0 otherwise. - std::string second_pred = + const std::string second_pred = GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0); - std::string combiner = GetPredicateCombiner(instr.fset.op); + const std::string combiner = GetPredicateCombiner(instr.fset.op); - std::string predicate = "((" + GetPredicateComparison(instr.fset.cond, op_a, op_b) + - ") " + combiner + " (" + second_pred + "))"; + const std::string predicate = "((" + + GetPredicateComparison(instr.fset.cond, op_a, op_b) + + ") " + combiner + " (" + second_pred + "))"; if (instr.fset.bf) { regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1); @@ -1930,7 +2419,7 @@ private: break; } case OpCode::Type::IntegerSet: { - std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed); + const std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, instr.iset.is_signed); std::string op_b; @@ -1947,13 +2436,14 @@ private: // The iset instruction sets a register to 1.0 or -1 (depending on the bf bit) if the // condition is true, and to 0 otherwise. - std::string second_pred = + const std::string second_pred = GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0); - std::string combiner = GetPredicateCombiner(instr.iset.op); + const std::string combiner = GetPredicateCombiner(instr.iset.op); - std::string predicate = "((" + GetPredicateComparison(instr.iset.cond, op_a, op_b) + - ") " + combiner + " (" + second_pred + "))"; + const std::string predicate = "((" + + GetPredicateComparison(instr.iset.cond, op_a, op_b) + + ") " + combiner + " (" + second_pred + "))"; if (instr.iset.bf) { regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1); @@ -2103,45 +2593,22 @@ private: case OpCode::Id::BRA: { ASSERT_MSG(instr.bra.constant_buffer == 0, "BRA with constant buffers are not implemented"); - u32 target = offset + instr.bra.GetBranchTarget(); + const u32 target = offset + instr.bra.GetBranchTarget(); shader.AddLine("{ jmp_to = " + std::to_string(target) + "u; break; }"); break; } case OpCode::Id::IPA: { const auto& attribute = instr.attribute.fmt28; const auto& reg = instr.gpr0; - switch (instr.ipa.mode) { - case Tegra::Shader::IpaMode::Pass: - if (stage == Maxwell3D::Regs::ShaderStage::Fragment && - attribute.index == Attribute::Index::Position) { - switch (attribute.element) { - case 0: - shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.x;"); - break; - case 1: - shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.y;"); - break; - case 2: - shader.AddLine(regs.GetRegisterAsFloat(reg) + " = gl_FragCoord.z;"); - break; - case 3: - shader.AddLine(regs.GetRegisterAsFloat(reg) + " = 1.0;"); - break; - } - } else { - regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index); - } - break; - case Tegra::Shader::IpaMode::None: - regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index); - break; - default: - LOG_CRITICAL(HW_GPU, "Unhandled IPA mode: {}", - static_cast<u32>(instr.ipa.mode.Value())); - UNREACHABLE(); - regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index); - } + Tegra::Shader::IpaMode input_mode{instr.ipa.interp_mode.Value(), + instr.ipa.sample_mode.Value()}; + regs.SetRegisterToInputAttibute(reg, attribute.element, attribute.index, + input_mode); + + if (instr.ipa.saturate) { + regs.SetRegisterToFloat(reg, 0, regs.GetRegisterAsFloat(reg), 1, 1, true); + } break; } case OpCode::Id::SSY: { @@ -2150,7 +2617,7 @@ private: // has a similar structure to the BRA opcode. ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported"); - u32 target = offset + instr.bra.GetBranchTarget(); + const u32 target = offset + instr.bra.GetBranchTarget(); EmitPushToSSYStack(target); break; } @@ -2244,10 +2711,10 @@ private: shader.AddLine("case " + std::to_string(label) + "u: {"); ++shader.scope; - auto next_it = labels.lower_bound(label + 1); - u32 next_label = next_it == labels.end() ? subroutine.end : *next_it; + const auto next_it = labels.lower_bound(label + 1); + const u32 next_label = next_it == labels.end() ? subroutine.end : *next_it; - u32 compile_end = CompileRange(label, next_label); + const u32 compile_end = CompileRange(label, next_label); if (compile_end > next_label && compile_end != PROGRAM_END) { // This happens only when there is a label inside a IF/LOOP block shader.AddLine(" jmp_to = " + std::to_string(compile_end) + "u; break; }"); @@ -2289,6 +2756,7 @@ private: private: const std::set<Subroutine>& subroutines; const ProgramCode& program_code; + Tegra::Shader::Header header; const u32 main_offset; Maxwell3D::Regs::ShaderStage stage; const std::string& suffix; @@ -2310,7 +2778,8 @@ boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, Maxwell3D::Regs::ShaderStage stage, const std::string& suffix) { try { - auto subroutines = ControlFlowAnalyzer(program_code, main_offset, suffix).GetSubroutines(); + const auto subroutines = + ControlFlowAnalyzer(program_code, main_offset, suffix).GetSubroutines(); GLSLGenerator generator(subroutines, program_code, main_offset, stage, suffix); return ProgramResult{generator.GetShaderCode(), generator.GetEntries()}; } catch (const DecompileFail& exception) { diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 6ca05945e..b0466c18f 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -42,6 +42,7 @@ layout (std140) uniform vs_config { }; void main() { + position = vec4(0.0, 0.0, 0.0, 0.0); exec_vertex(); )"; @@ -87,7 +88,14 @@ ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { .get_value_or({}); out += R"( in vec4 position; -layout(location = 0) out vec4 color[8]; +layout(location = 0) out vec4 FragColor0; +layout(location = 1) out vec4 FragColor1; +layout(location = 2) out vec4 FragColor2; +layout(location = 3) out vec4 FragColor3; +layout(location = 4) out vec4 FragColor4; +layout(location = 5) out vec4 FragColor5; +layout(location = 6) out vec4 FragColor6; +layout(location = 7) out vec4 FragColor7; layout (std140) uniform fs_config { vec4 viewport_flip; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index c788099d4..d53b93ad5 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -9,10 +9,11 @@ #include <vector> #include "common/common_types.h" +#include "video_core/engines/shader_bytecode.h" namespace OpenGL::GLShader { -constexpr size_t MAX_PROGRAM_CODE_LENGTH{0x1000}; +constexpr std::size_t MAX_PROGRAM_CODE_LENGTH{0x1000}; using ProgramCode = std::vector<u64>; class ConstBufferEntry { @@ -50,7 +51,11 @@ public: } std::string GetName() const { - return BufferBaseNames[static_cast<size_t>(stage)] + std::to_string(index); + return BufferBaseNames[static_cast<std::size_t>(stage)] + std::to_string(index); + } + + u32 GetHash() const { + return (static_cast<u32>(stage) << 16) | index; } private: @@ -69,14 +74,15 @@ class SamplerEntry { using Maxwell = Tegra::Engines::Maxwell3D::Regs; public: - SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index) - : offset(offset), stage(stage), sampler_index(index) {} + SamplerEntry(Maxwell::ShaderStage stage, std::size_t offset, std::size_t index, + Tegra::Shader::TextureType type, bool is_array) + : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array) {} - size_t GetOffset() const { + std::size_t GetOffset() const { return offset; } - size_t GetIndex() const { + std::size_t GetIndex() const { return sampler_index; } @@ -85,23 +91,63 @@ public: } std::string GetName() const { - return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '[' + - std::to_string(sampler_index) + ']'; + return std::string(TextureSamplerNames[static_cast<std::size_t>(stage)]) + '_' + + std::to_string(sampler_index); + } + + std::string GetTypeString() const { + using Tegra::Shader::TextureType; + std::string glsl_type; + + switch (type) { + case TextureType::Texture1D: + glsl_type = "sampler1D"; + break; + case TextureType::Texture2D: + glsl_type = "sampler2D"; + break; + case TextureType::Texture3D: + glsl_type = "sampler3D"; + break; + case TextureType::TextureCube: + glsl_type = "samplerCube"; + break; + default: + UNIMPLEMENTED(); + } + if (is_array) + glsl_type += "Array"; + return glsl_type; + } + + Tegra::Shader::TextureType GetType() const { + return type; + } + + bool IsArray() const { + return is_array; + } + + u32 GetHash() const { + return (static_cast<u32>(stage) << 16) | static_cast<u32>(sampler_index); } static std::string GetArrayName(Maxwell::ShaderStage stage) { - return TextureSamplerNames[static_cast<size_t>(stage)]; + return TextureSamplerNames[static_cast<std::size_t>(stage)]; } private: static constexpr std::array<const char*, Maxwell::MaxShaderStage> TextureSamplerNames = { "tex_vs", "tex_tessc", "tex_tesse", "tex_gs", "tex_fs", }; + /// Offset in TSC memory from which to read the sampler object, as specified by the sampling /// instruction. - size_t offset; - Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used. - size_t sampler_index; ///< Value used to index into the generated GLSL sampler array. + std::size_t offset; + Maxwell::ShaderStage stage; ///< Shader stage where this sampler was used. + std::size_t sampler_index; ///< Value used to index into the generated GLSL sampler array. + Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc) + bool is_array; ///< Whether the texture is being sampled as an array texture or not. }; struct ShaderEntries { diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 533e42caa..b86cd96e8 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -12,7 +12,7 @@ namespace OpenGL::GLShader { /// Number of OpenGL texture samplers that can be used in the fragment shader -static constexpr size_t NumTextureSamplers = 32; +static constexpr std::size_t NumTextureSamplers = 32; using Tegra::Engines::Maxwell3D; diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 5781d9d16..5f3fe067e 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -25,7 +25,7 @@ GLuint LoadShader(const char* source, GLenum type) { default: UNREACHABLE(); } - GLuint shader_id = glCreateShader(type); + const GLuint shader_id = glCreateShader(type); glShaderSource(shader_id, 1, &source, nullptr); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); glCompileShader(shader_id); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 60a4defd1..af99132ba 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -200,9 +200,9 @@ void OpenGLState::Apply() const { const auto& texture_unit = texture_units[i]; const auto& cur_state_texture_unit = cur_state.texture_units[i]; - if (texture_unit.texture_2d != cur_state_texture_unit.texture_2d) { + if (texture_unit.texture != cur_state_texture_unit.texture) { glActiveTexture(TextureUnits::MaxwellTexture(static_cast<int>(i)).Enum()); - glBindTexture(GL_TEXTURE_2D, texture_unit.texture_2d); + glBindTexture(texture_unit.target, texture_unit.texture); } if (texture_unit.sampler != cur_state_texture_unit.sampler) { glBindSampler(static_cast<GLuint>(i), texture_unit.sampler); @@ -214,7 +214,7 @@ void OpenGLState::Apply() const { texture_unit.swizzle.a != cur_state_texture_unit.swizzle.a) { std::array<GLint, 4> mask = {texture_unit.swizzle.r, texture_unit.swizzle.g, texture_unit.swizzle.b, texture_unit.swizzle.a}; - glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, mask.data()); + glTexParameteriv(texture_unit.target, GL_TEXTURE_SWIZZLE_RGBA, mask.data()); } } @@ -272,7 +272,7 @@ void OpenGLState::Apply() const { } // Clip distance - for (size_t i = 0; i < clip_distance.size(); ++i) { + for (std::size_t i = 0; i < clip_distance.size(); ++i) { if (clip_distance[i] != cur_state.clip_distance[i]) { if (clip_distance[i]) { glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i)); @@ -287,7 +287,7 @@ void OpenGLState::Apply() const { OpenGLState& OpenGLState::UnbindTexture(GLuint handle) { for (auto& unit : texture_units) { - if (unit.texture_2d == handle) { + if (unit.texture == handle) { unit.Unbind(); } } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 46e96a97d..e3e24b9e7 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -94,8 +94,9 @@ public: // 3 texture units - one for each that is used in PICA fragment shader emulation struct TextureUnit { - GLuint texture_2d; // GL_TEXTURE_BINDING_2D - GLuint sampler; // GL_SAMPLER_BINDING + GLuint texture; // GL_TEXTURE_BINDING_2D + GLuint sampler; // GL_SAMPLER_BINDING + GLenum target; struct { GLint r; // GL_TEXTURE_SWIZZLE_R GLint g; // GL_TEXTURE_SWIZZLE_G @@ -104,7 +105,7 @@ public: } swizzle; void Unbind() { - texture_2d = 0; + texture = 0; swizzle.r = GL_RED; swizzle.g = GL_GREEN; swizzle.b = GL_BLUE; @@ -114,6 +115,7 @@ public: void Reset() { Unbind(); sampler = 0; + target = GL_TEXTURE_2D; } }; std::array<TextureUnit, 32> texture_units; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index e565afcee..664f3ca20 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -29,7 +29,7 @@ OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coh if (GLAD_GL_ARB_buffer_storage) { persistent = true; coherent = prefer_coherent; - GLbitfield flags = + const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); glBufferStorage(gl_target, allocate_size, nullptr, flags); mapped_ptr = static_cast<u8*>(glMapBufferRange( @@ -61,7 +61,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a mapped_size = size; if (alignment > 0) { - buffer_pos = Common::AlignUp<size_t>(buffer_pos, alignment); + buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment); } bool invalidate = false; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 411a73d50..96d916b07 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -177,7 +177,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf Memory::GetPointer(framebuffer_addr), gl_framebuffer_data.data(), true); - state.texture_units[0].texture_2d = screen_info.texture.resource.handle; + state.texture_units[0].texture = screen_info.texture.resource.handle; state.Apply(); glActiveTexture(GL_TEXTURE0); @@ -194,7 +194,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); - state.texture_units[0].texture_2d = 0; + state.texture_units[0].texture = 0; state.Apply(); } } @@ -205,7 +205,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf */ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a, const TextureInfo& texture) { - state.texture_units[0].texture_2d = texture.resource.handle; + state.texture_units[0].texture = texture.resource.handle; state.Apply(); glActiveTexture(GL_TEXTURE0); @@ -214,7 +214,7 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color // Update existing texture glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 1, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, framebuffer_data); - state.texture_units[0].texture_2d = 0; + state.texture_units[0].texture = 0; state.Apply(); } @@ -260,7 +260,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Allocation of storage is deferred until the first frame, when we // know the framebuffer size. - state.texture_units[0].texture_2d = screen_info.texture.resource.handle; + state.texture_units[0].texture = screen_info.texture.resource.handle; state.Apply(); glActiveTexture(GL_TEXTURE0); @@ -272,7 +272,7 @@ void RendererOpenGL::InitOpenGLObjects() { screen_info.display_texture = screen_info.texture.resource.handle; - state.texture_units[0].texture_2d = 0; + state.texture_units[0].texture = 0; state.Apply(); // Clear screen to black @@ -305,14 +305,14 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, UNREACHABLE(); } - state.texture_units[0].texture_2d = texture.resource.handle; + state.texture_units[0].texture = texture.resource.handle; state.Apply(); glActiveTexture(GL_TEXTURE0); glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0, texture.gl_format, texture.gl_type, nullptr); - state.texture_units[0].texture_2d = 0; + state.texture_units[0].texture = 0; state.Apply(); } @@ -354,14 +354,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v), }}; - state.texture_units[0].texture_2d = screen_info.display_texture; + state.texture_units[0].texture = screen_info.display_texture; state.texture_units[0].swizzle = {GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA}; state.Apply(); glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices.data()); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); - state.texture_units[0].texture_2d = 0; + state.texture_units[0].texture = 0; state.Apply(); } @@ -369,6 +369,12 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, * Draws the emulated screens to the emulator window. */ void RendererOpenGL::DrawScreen() { + if (renderer_settings.set_background_color) { + // Update background color before drawing + glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, + 0.0f); + } + const auto& layout = render_window.GetFramebufferLayout(); const auto& screen = layout.screen; diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 272294c62..20ba6d4f6 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -46,6 +46,48 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_ } } +template <std::size_t N, std::size_t M> +struct alignas(64) SwizzleTable { + constexpr SwizzleTable() { + for (u32 y = 0; y < N; ++y) { + for (u32 x = 0; x < M; ++x) { + const u32 x2 = x * 16; + values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 + + ((x2 % 32) / 16) * 32 + (y % 2) * 16); + } + } + } + const std::array<u16, M>& operator[](std::size_t index) const { + return values[index]; + } + std::array<std::array<u16, M>, N> values{}; +}; + +constexpr auto swizzle_table = SwizzleTable<8, 4>(); + +void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_data, + u8* unswizzled_data, bool unswizzle, u32 block_height) { + std::array<u8*, 2> data_ptrs; + const std::size_t stride{width * bytes_per_pixel}; + const std::size_t image_width_in_gobs{(stride + 63) / 64}; + const std::size_t copy_size{16}; + for (std::size_t y = 0; y < height; ++y) { + const std::size_t initial_gob = + (y / (8 * block_height)) * 512 * block_height * image_width_in_gobs + + (y % (8 * block_height) / 8) * 512; + const std::size_t pixel_base{y * width * bytes_per_pixel}; + const auto& table = swizzle_table[y % 8]; + for (std::size_t xb = 0; xb < stride; xb += copy_size) { + const std::size_t gob_address{initial_gob + (xb / 64) * 512 * block_height}; + const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]}; + const std::size_t pixel_index{xb + pixel_base}; + data_ptrs[unswizzle] = swizzled_data + swizzle_offset; + data_ptrs[!unswizzle] = unswizzled_data + pixel_index; + std::memcpy(data_ptrs[0], data_ptrs[1], copy_size); + } + } +} + u32 BytesPerPixel(TextureFormat format) { switch (format) { case TextureFormat::DXT1: @@ -63,6 +105,7 @@ u32 BytesPerPixel(TextureFormat format) { case TextureFormat::R32_G32_B32: return 12; case TextureFormat::ASTC_2D_4X4: + case TextureFormat::ASTC_2D_8X8: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::BF10GF11RF11: @@ -91,8 +134,13 @@ u32 BytesPerPixel(TextureFormat format) { std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width, u32 height, u32 block_height) { std::vector<u8> unswizzled_data(width * height * bytes_per_pixel); - CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel, - Memory::GetPointer(address), unswizzled_data.data(), true, block_height); + if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % 16 == 0) { + FastSwizzleData(width / tile_size, height / tile_size, bytes_per_pixel, + Memory::GetPointer(address), unswizzled_data.data(), true, block_height); + } else { + CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel, + Memory::GetPointer(address), unswizzled_data.data(), true, block_height); + } return unswizzled_data; } @@ -111,6 +159,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat case TextureFormat::BC6H_UF16: case TextureFormat::BC6H_SF16: case TextureFormat::ASTC_2D_4X4: + case TextureFormat::ASTC_2D_8X8: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::A1B5G5R5: diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index c6bd2f4b9..c2fb824b2 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -170,8 +170,12 @@ struct TICEntry { BitField<0, 16, u32> width_minus_1; BitField<23, 4, TextureType> texture_type; }; - u16 height_minus_1; - INSERT_PADDING_BYTES(10); + union { + BitField<0, 16, u32> height_minus_1; + BitField<16, 15, u32> depth_minus_1; + }; + + INSERT_PADDING_BYTES(8); GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); @@ -192,6 +196,10 @@ struct TICEntry { return height_minus_1 + 1; } + u32 Depth() const { + return depth_minus_1 + 1; + } + u32 BlockHeight() const { ASSERT(header_version == TICHeaderVersion::BlockLinear || header_version == TICHeaderVersion::BlockLinearColorKey); |
