diff options
Diffstat (limited to 'src/video_core')
62 files changed, 1827 insertions, 932 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 63577a9c5..5c8ca429e 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -130,7 +130,9 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_scheduler.cpp renderer_vulkan/vk_scheduler.h renderer_vulkan/vk_stream_buffer.cpp - renderer_vulkan/vk_stream_buffer.h) + renderer_vulkan/vk_stream_buffer.h + renderer_vulkan/vk_swapchain.cpp + renderer_vulkan/vk_swapchain.h) target_include_directories(video_core PRIVATE ../../externals/Vulkan-Headers/include) target_compile_definitions(video_core PRIVATE HAS_VULKAN) @@ -139,4 +141,4 @@ endif() create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PRIVATE glad lz4_static) +target_link_libraries(video_core PRIVATE glad) diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 5ffb492ea..f0ef67535 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -10,7 +10,7 @@ namespace Tegra { void DebugContext::DoOnEvent(Event event, void* data) { { - std::unique_lock<std::mutex> lock(breakpoint_mutex); + std::unique_lock lock{breakpoint_mutex}; // TODO(Subv): Commit the rasterizer's caches so framebuffers, render targets, etc. 
will // show on debug widgets @@ -32,7 +32,7 @@ void DebugContext::DoOnEvent(Event event, void* data) { void DebugContext::Resume() { { - std::lock_guard<std::mutex> lock(breakpoint_mutex); + std::lock_guard lock{breakpoint_mutex}; // Tell all observers that we are about to resume for (auto& breakpoint_observer : breakpoint_observers) { diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index c235faf46..ac3a2eb01 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -40,7 +40,7 @@ public: /// Constructs the object such that it observes events of the given DebugContext. explicit BreakPointObserver(std::shared_ptr<DebugContext> debug_context) : context_weak(debug_context) { - std::unique_lock<std::mutex> lock(debug_context->breakpoint_mutex); + std::unique_lock lock{debug_context->breakpoint_mutex}; debug_context->breakpoint_observers.push_back(this); } @@ -48,7 +48,7 @@ public: auto context = context_weak.lock(); if (context) { { - std::unique_lock<std::mutex> lock(context->breakpoint_mutex); + std::unique_lock lock{context->breakpoint_mutex}; context->breakpoint_observers.remove(this); } diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 8b1bea1ae..046d047cb 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -8,6 +8,7 @@ #include "video_core/dma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" +#include "video_core/memory_manager.h" namespace Tegra { diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 27a36348c..6ab06518f 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -9,7 +9,6 @@ #include "common/bit_field.h" #include "common/common_types.h" -#include "video_core/memory_manager.h" namespace Tegra { diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 03b7ee5d8..55966eef1 100644 --- 
a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -6,12 +6,13 @@ #include "common/logging/log.h" #include "common/math_util.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" namespace Tegra::Engines { Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : memory_manager(memory_manager), rasterizer{rasterizer} {} + : rasterizer{rasterizer}, memory_manager{memory_manager} {} void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { ASSERT_MSG(method_call.method < Regs::NUM_REGS, diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 80523e320..2e51b7f13 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -10,7 +10,10 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" + +namespace Tegra { +class MemoryManager; +} namespace VideoCore { class RasterizerInterface; @@ -115,10 +118,9 @@ public: }; } regs{}; - MemoryManager& memory_manager; - private: VideoCore::RasterizerInterface& rasterizer; + MemoryManager& memory_manager; /// Performs the copy from the source surface to the destination surface as configured in the /// registers. 
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 6575afd0f..fb6cdf432 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -9,7 +9,10 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" + +namespace Tegra { +class MemoryManager; +} namespace Tegra::Engines { @@ -40,10 +43,11 @@ public: static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "KeplerCompute Regs has wrong size"); - MemoryManager& memory_manager; - /// Write the value to the register identified by method. void CallMethod(const GPU::MethodCall& method_call); + +private: + MemoryManager& memory_manager; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 0931b9626..cd51a31d7 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -5,9 +5,9 @@ #include "common/assert.h" #include "common/logging/log.h" #include "core/core.h" -#include "core/memory.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" @@ -15,7 +15,7 @@ namespace Tegra::Engines { KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : system{system}, memory_manager(memory_manager), rasterizer{rasterizer} {} + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {} KeplerMemory::~KeplerMemory() = default; @@ -46,7 +46,7 @@ void KeplerMemory::ProcessData(u32 data) { // contain a dirty surface that will have to be written back to memory. 
const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)}; rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32)); - memory_manager.Write32(address, data); + memory_manager.Write<u32>(address, data); system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 9181e9d80..78b6c3e45 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -10,12 +10,15 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" namespace Core { class System; } +namespace Tegra { +class MemoryManager; +} + namespace VideoCore { class RasterizerInterface; } @@ -82,8 +85,8 @@ public: private: Core::System& system; - MemoryManager& memory_manager; VideoCore::RasterizerInterface& rasterizer; + MemoryManager& memory_manager; void ProcessData(u32 data); }; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index c5d5be4ef..74403eed4 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -7,11 +7,10 @@ #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/memory.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" -#include "video_core/renderer_base.h" #include "video_core/textures/texture.h" namespace Tegra::Engines { @@ -21,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : memory_manager(memory_manager), system{system}, rasterizer{rasterizer}, - macro_interpreter(*this) { + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, 
macro_interpreter{ + *this} { InitializeRegisterDefaults(); } @@ -250,6 +249,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryGet(); break; } + case MAXWELL3D_REG_INDEX(sync_info): { + ProcessSyncPoint(); + break; + } default: break; } @@ -307,7 +310,7 @@ void Maxwell3D::ProcessQueryGet() { // Write the current query sequence to the sequence address. // TODO(Subv): Find out what happens if you use a long query type but mark it as a short // query. - memory_manager.Write32(sequence_address, sequence); + memory_manager.Write<u32>(sequence_address, sequence); } else { // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast // GPU, this command may actually take a while to complete in real hardware due to GPU @@ -327,6 +330,14 @@ void Maxwell3D::ProcessQueryGet() { } } +void Maxwell3D::ProcessSyncPoint() { + const u32 sync_point = regs.sync_info.sync_point.Value(); + const u32 increment = regs.sync_info.increment.Value(); + const u32 cache_flush = regs.sync_info.unknown.Value(); + LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment, + cache_flush); +} + void Maxwell3D::DrawArrays() { LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), regs.vertex_buffer.count); @@ -395,7 +406,7 @@ void Maxwell3D::ProcessCBData(u32 value) { u8* ptr{memory_manager.GetPointer(address)}; rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32)); - memory_manager.Write32(address, value); + memory_manager.Write<u32>(address, value); dirty_flags.OnMemoryWrite(); @@ -447,7 +458,7 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset; current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) { - const Texture::TextureHandle tex_handle{memory_manager.Read32(current_texture)}; + const Texture::TextureHandle 
tex_handle{memory_manager.Read<u32>(current_texture)}; Texture::FullTextureInfo tex_info{}; // TODO(Subv): Use the shader to determine which textures are actually accessed. @@ -482,7 +493,7 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); - const Texture::TextureHandle tex_handle{memory_manager.Read32(tex_info_address)}; + const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; Texture::FullTextureInfo tex_info{}; tex_info.index = static_cast<u32>(offset); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 7fbf1026e..321af3297 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -16,13 +16,16 @@ #include "common/math_util.h" #include "video_core/gpu.h" #include "video_core/macro_interpreter.h" -#include "video_core/memory_manager.h" #include "video_core/textures/texture.h" namespace Core { class System; } +namespace Tegra { +class MemoryManager; +} + namespace VideoCore { class RasterizerInterface; } @@ -576,7 +579,17 @@ public: u32 bind; } macros; - INSERT_PADDING_WORDS(0x188); + INSERT_PADDING_WORDS(0x69); + + struct { + union { + BitField<0, 16, u32> sync_point; + BitField<16, 1, u32> unknown; + BitField<20, 1, u32> increment; + }; + } sync_info; + + INSERT_PADDING_WORDS(0x11E); u32 tfb_enabled; @@ -1093,7 +1106,6 @@ public: }; State state{}; - MemoryManager& memory_manager; struct DirtyFlags { std::bitset<8> color_buffer{0xFF}; @@ -1141,6 +1153,8 @@ private: VideoCore::RasterizerInterface& rasterizer; + MemoryManager& memory_manager; + /// Start offsets of each macro in macro_memory std::unordered_map<u32, u32> macro_offsets; @@ -1180,6 +1194,9 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); + /// Handles writes to syncing register. + void ProcessSyncPoint(); + /// Handles a write to the CB_DATA[i] register. 
void ProcessCBData(u32 value); @@ -1195,6 +1212,7 @@ private: "Field " #field_name " has invalid position") ASSERT_REG_POSITION(macros, 0x45); +ASSERT_REG_POSITION(sync_info, 0xB2); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); ASSERT_REG_POSITION(viewport_transform, 0x280); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index a0ded4c25..2426d0067 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -5,9 +5,9 @@ #include "common/assert.h" #include "common/logging/log.h" #include "core/core.h" -#include "core/memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" #include "video_core/textures/decoders.h" @@ -16,7 +16,7 @@ namespace Tegra::Engines { MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : memory_manager(memory_manager), system{system}, rasterizer{rasterizer} {} + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {} void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { ASSERT_MSG(method_call.method < Regs::NUM_REGS, @@ -88,6 +88,16 @@ void MaxwellDMA::HandleCopy() { auto source_ptr{memory_manager.GetPointer(source)}; auto dst_ptr{memory_manager.GetPointer(dest)}; + if (!source_ptr) { + LOG_ERROR(HW_GPU, "source_ptr is invalid"); + return; + } + + if (!dst_ptr) { + LOG_ERROR(HW_GPU, "dst_ptr is invalid"); + return; + } + const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated // copying. 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 34c369320..c6b649842 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -10,12 +10,15 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" namespace Core { class System; } +namespace Tegra { +class MemoryManager; +} + namespace VideoCore { class RasterizerInterface; } @@ -139,13 +142,13 @@ public: }; } regs{}; - MemoryManager& memory_manager; - private: Core::System& system; VideoCore::RasterizerInterface& rasterizer; + MemoryManager& memory_manager; + /// Performs the copy from the source buffer to the destination buffer as configured in the /// registers. void HandleCopy(); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 7f613370b..2e1e96c81 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -1238,13 +1238,16 @@ union Instruction { union { BitField<20, 16, u64> imm20_16; + BitField<35, 1, u64> high_b_rr; // used on RR BitField<36, 1, u64> product_shift_left; BitField<37, 1, u64> merge_37; BitField<48, 1, u64> sign_a; BitField<49, 1, u64> sign_b; + BitField<50, 2, XmadMode> mode_cbf; // used by CR, RC BitField<50, 3, XmadMode> mode; BitField<52, 1, u64> high_b; BitField<53, 1, u64> high_a; + BitField<55, 1, u64> product_shift_left_second; // used on CR BitField<56, 1, u64> merge_56; } xmad; @@ -1662,7 +1665,7 @@ private: INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"), INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"), INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"), - INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"), + INST("0011100-01000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"), INST("000001----------", Id::LOP32I, 
Type::ArithmeticIntegerImmediate, "LOP32I"), INST("0000001---------", Id::LOP3_C, Type::ArithmeticInteger, "LOP3_C"), INST("0101101111100---", Id::LOP3_R, Type::ArithmeticInteger, "LOP3_R"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 66c690494..4461083ff 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -12,6 +12,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/gpu.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_base.h" namespace Tegra { @@ -30,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} { auto& rasterizer{renderer.Rasterizer()}; - memory_manager = std::make_unique<Tegra::MemoryManager>(); + memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer); dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); @@ -285,9 +286,10 @@ void GPU::ProcessSemaphoreTriggerMethod() { // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of // CoreTiming block.timestamp = Core::System::GetInstance().CoreTiming().GetTicks(); - memory_manager->WriteBlock(regs.smaphore_address.SmaphoreAddress(), &block, sizeof(block)); + memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, + sizeof(block)); } else { - const u32 word{memory_manager->Read32(regs.smaphore_address.SmaphoreAddress())}; + const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())}; if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || (op == GpuSemaphoreOperation::AcquireGequal && static_cast<s32>(word - regs.semaphore_sequence) > 0) || @@ -314,11 +316,11 @@ void GPU::ProcessSemaphoreTriggerMethod() { } void 
GPU::ProcessSemaphoreRelease() { - memory_manager->Write32(regs.smaphore_address.SmaphoreAddress(), regs.semaphore_release); + memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release); } void GPU::ProcessSemaphoreAcquire() { - const u32 word = memory_manager->Read32(regs.smaphore_address.SmaphoreAddress()); + const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress()); const auto value = regs.semaphore_acquire; if (word != value) { regs.acquire_active = true; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index a14b95c30..de30ea354 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -9,7 +9,6 @@ #include "common/common_types.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/dma_pusher.h" -#include "video_core/memory_manager.h" using CacheAddr = std::uintptr_t; inline CacheAddr ToCacheAddr(const void* host_ptr) { @@ -124,6 +123,8 @@ enum class EngineID { MAXWELL_DMA_COPY_A = 0xB0B5, }; +class MemoryManager; + class GPU { public: explicit GPU(Core::System& system, VideoCore::RendererBase& renderer); @@ -176,11 +177,11 @@ public: u32 address_high; u32 address_low; - GPUVAddr SmaphoreAddress() const { + GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } - } smaphore_address; + } semaphore_address; u32 semaphore_sequence; u32 semaphore_trigger; @@ -244,9 +245,8 @@ protected: private: std::unique_ptr<Tegra::MemoryManager> memory_manager; - /// Mapping of command subchannels to their bound engine ids. 
+ /// Mapping of command subchannels to their bound engine ids std::array<EngineID, 8> bound_engines = {}; - /// 3D engine std::unique_ptr<Engines::Maxwell3D> maxwell_3d; /// 2D engine @@ -263,7 +263,7 @@ private: static_assert(offsetof(GPU::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(smaphore_address, 0x4); +ASSERT_REG_POSITION(semaphore_address, 0x4); ASSERT_REG_POSITION(semaphore_sequence, 0x6); ASSERT_REG_POSITION(semaphore_trigger, 0x7); ASSERT_REG_POSITION(reference_count, 0x14); diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 8b355cf7b..db507cf04 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -9,7 +9,7 @@ namespace VideoCommon { GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) - : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {} + : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {} GPUAsynch::~GPUAsynch() = default; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 086b2f625..cc56cf467 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,6 +4,9 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/core_timing_util.h" #include "core/frontend/scope_acquire_window_context.h" #include "video_core/dma_pusher.h" #include "video_core/gpu.h" @@ -36,7 +39,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { - state.DecrementFramesCounter(); renderer.SwapBuffers(std::move(data->framebuffer)); } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { renderer.Rasterizer().FlushRegion(data->addr, data->size); @@ -47,13 +49,18 @@ 
static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p } else { UNREACHABLE(); } + state.signaled_fence = next.fence; + state.TrySynchronize(); } } } -ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) - : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer), - std::ref(dma_pusher), std::ref(state)} {} +ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, + Tegra::DmaPusher& dma_pusher) + : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} { + synchronization_event = system.CoreTiming().RegisterEvent( + "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); }); +} ThreadManager::~ThreadManager() { // Notify GPU thread that a shutdown is pending @@ -62,14 +69,14 @@ ThreadManager::~ThreadManager() { } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { - PushCommand(SubmitListCommand(std::move(entries))); + const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))}; + const s64 synchronization_ticks{Core::Timing::usToCycles(9000)}; + system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence); } void ThreadManager::SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { - state.IncrementFramesCounter(); PushCommand(SwapBuffersCommand(std::move(framebuffer))); - state.WaitForFrames(); } void ThreadManager::FlushRegion(CacheAddr addr, u64 size) { @@ -79,7 +86,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) { void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) { if (state.queue.Empty()) { // It's quicker to invalidate a single region on the CPU if the queue is already empty - renderer.Rasterizer().InvalidateRegion(addr, size); + system.Renderer().Rasterizer().InvalidateRegion(addr, size); } else { PushCommand(InvalidateRegionCommand(addr, size)); } @@ -90,9 
+97,25 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { InvalidateRegion(addr, size); } -void ThreadManager::PushCommand(CommandData&& command_data) { - state.queue.Push(CommandDataContainer(std::move(command_data))); +u64 ThreadManager::PushCommand(CommandData&& command_data) { + const u64 fence{++state.last_fence}; + state.queue.Push(CommandDataContainer(std::move(command_data), fence)); state.SignalCommands(); + return fence; +} + +MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); +void SynchState::WaitForSynchronization(u64 fence) { + if (signaled_fence >= fence) { + return; + } + + // Wait for the GPU to be idle (all commands to be executed) + { + MICROPROFILE_SCOPE(GPU_wait); + std::unique_lock<std::mutex> lock{synchronization_mutex}; + synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; }); + } } } // namespace VideoCommon::GPUThread diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 8cd7db1c6..62bcea5bb 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -4,10 +4,8 @@ #pragma once -#include <array> #include <atomic> #include <condition_variable> -#include <memory> #include <mutex> #include <optional> #include <thread> @@ -21,9 +19,12 @@ struct FramebufferConfig; class DmaPusher; } // namespace Tegra -namespace VideoCore { -class RendererBase; -} // namespace VideoCore +namespace Core { +class System; +namespace Timing { +struct EventType; +} // namespace Timing +} // namespace Core namespace VideoCommon::GPUThread { @@ -77,81 +78,68 @@ using CommandData = struct CommandDataContainer { CommandDataContainer() = default; - CommandDataContainer(CommandData&& data) : data{std::move(data)} {} + CommandDataContainer(CommandData&& data, u64 next_fence) + : data{std::move(data)}, fence{next_fence} {} CommandDataContainer& operator=(const CommandDataContainer& t) { data = std::move(t.data); + fence = t.fence; return *this; } 
CommandData data; + u64 fence{}; }; /// Struct used to synchronize the GPU thread struct SynchState final { std::atomic_bool is_running{true}; std::atomic_int queued_frame_count{}; - std::mutex frames_mutex; + std::mutex synchronization_mutex; std::mutex commands_mutex; std::condition_variable commands_condition; - std::condition_variable frames_condition; + std::condition_variable synchronization_condition; - void IncrementFramesCounter() { - std::lock_guard<std::mutex> lock{frames_mutex}; - ++queued_frame_count; + /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU + /// synchronized. This is entirely empirical. + bool IsSynchronized() const { + constexpr std::size_t max_queue_gap{5}; + return queue.Size() <= max_queue_gap; } - void DecrementFramesCounter() { - { - std::lock_guard<std::mutex> lock{frames_mutex}; - --queued_frame_count; - - if (queued_frame_count) { - return; - } + void TrySynchronize() { + if (IsSynchronized()) { + std::lock_guard<std::mutex> lock{synchronization_mutex}; + synchronization_condition.notify_one(); } - frames_condition.notify_one(); } - void WaitForFrames() { - { - std::lock_guard<std::mutex> lock{frames_mutex}; - if (!queued_frame_count) { - return; - } - } - - // Wait for the GPU to be idle (all commands to be executed) - { - std::unique_lock<std::mutex> lock{frames_mutex}; - frames_condition.wait(lock, [this] { return !queued_frame_count; }); - } - } + void WaitForSynchronization(u64 fence); void SignalCommands() { - { - std::unique_lock<std::mutex> lock{commands_mutex}; - if (queue.Empty()) { - return; - } + if (queue.Empty()) { + return; } commands_condition.notify_one(); } void WaitForCommands() { - std::unique_lock<std::mutex> lock{commands_mutex}; + std::unique_lock lock{commands_mutex}; commands_condition.wait(lock, [this] { return !queue.Empty(); }); } using CommandQueue = Common::SPSCQueue<CommandDataContainer>; CommandQueue queue; + u64 last_fence{}; + std::atomic<u64> 
signaled_fence{}; }; /// Class used to manage the GPU thread class ThreadManager final { public: - explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher); + explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, + Tegra::DmaPusher& dma_pusher); ~ThreadManager(); /// Push GPU command entries to be processed @@ -172,12 +160,12 @@ public: private: /// Pushes a command to be executed by the GPU thread - void PushCommand(CommandData&& command_data); + u64 PushCommand(CommandData&& command_data); private: SynchState state; - VideoCore::RendererBase& renderer; - Tegra::DmaPusher& dma_pusher; + Core::System& system; + Core::Timing::EventType* synchronization_event{}; std::thread thread; std::thread::id thread_id; }; diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index 64f75db43..524d9ea5a 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -223,27 +223,21 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res } u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < parameters.size()); - return parameters[next_parameter_index++]; + return parameters.at(next_parameter_index++); } u32 MacroInterpreter::GetRegister(u32 register_id) const { - // Register 0 is supposed to always return 0. - if (register_id == 0) - return 0; - - ASSERT(register_id < registers.size()); - return registers[register_id]; + return registers.at(register_id); } void MacroInterpreter::SetRegister(u32 register_id, u32 value) { - // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero - // register. - if (register_id == 0) + // Register 0 is hardwired as the zero register. + // Ensure no writes to it actually occur. 
+ if (register_id == 0) { return; + } - ASSERT(register_id < registers.size()); - registers[register_id] = value; + registers.at(register_id) = value; } void MacroInterpreter::SetMethodAddress(u32 address) { diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 8e8f36f28..0f4e820aa 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -7,216 +7,526 @@ #include "common/logging/log.h" #include "core/memory.h" #include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" namespace Tegra { -MemoryManager::MemoryManager() { - // Mark the first page as reserved, so that 0 is not a valid GPUVAddr. Otherwise, games might - // try to use 0 as a valid address, which is also used to mean nullptr. This fixes a bug with - // Undertale using 0 for a render target. - PageSlot(0) = static_cast<u64>(PageStatus::Reserved); -} - -GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { - const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, align, PageStatus::Unmapped)}; +MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} { + std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr); + std::fill(page_table.attributes.begin(), page_table.attributes.end(), + Common::PageType::Unmapped); + page_table.Resize(address_space_width); - ASSERT_MSG(gpu_addr, "unable to find available GPU memory"); + // Initialize the map with a single free region covering the entire managed space. 
+ VirtualMemoryArea initial_vma; + initial_vma.size = address_space_end; + vma_map.emplace(initial_vma.base, initial_vma); - for (u64 offset{}; offset < size; offset += PAGE_SIZE) { - VAddr& slot{PageSlot(*gpu_addr + offset)}; + UpdatePageTableForVMA(initial_vma); +} - ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); +GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { + const u64 aligned_size{Common::AlignUp(size, page_size)}; + const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; - slot = static_cast<u64>(PageStatus::Allocated); - } + AllocateMemory(gpu_addr, 0, aligned_size); - return *gpu_addr; + return gpu_addr; } GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) { - for (u64 offset{}; offset < size; offset += PAGE_SIZE) { - VAddr& slot{PageSlot(gpu_addr + offset)}; - - ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); + const u64 aligned_size{Common::AlignUp(size, page_size)}; - slot = static_cast<u64>(PageStatus::Allocated); - } + AllocateMemory(gpu_addr, 0, aligned_size); return gpu_addr; } GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { - const std::optional<GPUVAddr> gpu_addr{FindFreeBlock(0, size, PAGE_SIZE, PageStatus::Unmapped)}; + const u64 aligned_size{Common::AlignUp(size, page_size)}; + const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; - ASSERT_MSG(gpu_addr, "unable to find available GPU memory"); + MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr); - for (u64 offset{}; offset < size; offset += PAGE_SIZE) { - VAddr& slot{PageSlot(*gpu_addr + offset)}; + return gpu_addr; +} - ASSERT(slot == static_cast<u64>(PageStatus::Unmapped)); +GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) { + ASSERT((gpu_addr & page_mask) == 0); - slot = cpu_addr + offset; - } + const u64 aligned_size{Common::AlignUp(size, page_size)}; - const MappedRegion region{cpu_addr, *gpu_addr, size}; - 
mapped_regions.push_back(region); + MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr); - return *gpu_addr; + return gpu_addr; } -GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & PAGE_MASK) == 0); +GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { + ASSERT((gpu_addr & page_mask) == 0); - if (PageSlot(gpu_addr) != static_cast<u64>(PageStatus::Allocated)) { - // Page has been already mapped. In this case, we must find a new area of memory to use that - // is different than the specified one. Super Mario Odyssey hits this scenario when changing - // areas, but we do not want to overwrite the old pages. - // TODO(bunnei): We need to write a hardware test to confirm this behavior. + const u64 aligned_size{Common::AlignUp(size, page_size)}; + const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))}; - LOG_ERROR(HW_GPU, "attempting to map addr 0x{:016X}, which is not available!", gpu_addr); + rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); + UnmapRange(gpu_addr, aligned_size); - const std::optional<GPUVAddr> new_gpu_addr{ - FindFreeBlock(gpu_addr, size, PAGE_SIZE, PageStatus::Allocated)}; + return gpu_addr; +} + +GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) const { + // Find the first Free VMA. 
+ const VMAHandle vma_handle{ + std::find_if(vma_map.begin(), vma_map.end(), [region_start, size](const auto& vma) { + if (vma.second.type != VirtualMemoryArea::Type::Unmapped) { + return false; + } - ASSERT_MSG(new_gpu_addr, "unable to find available GPU memory"); + const VAddr vma_end{vma.second.base + vma.second.size}; + return vma_end > region_start && vma_end >= region_start + size; + })}; - gpu_addr = *new_gpu_addr; + if (vma_handle == vma_map.end()) { + return {}; } - for (u64 offset{}; offset < size; offset += PAGE_SIZE) { - VAddr& slot{PageSlot(gpu_addr + offset)}; + return std::max(region_start, vma_handle->second.base); +} - ASSERT(slot == static_cast<u64>(PageStatus::Allocated)); +bool MemoryManager::IsAddressValid(GPUVAddr addr) const { + return (addr >> page_bits) < page_table.pointers.size(); +} - slot = cpu_addr + offset; +std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) const { + if (!IsAddressValid(addr)) { + return {}; } - const MappedRegion region{cpu_addr, gpu_addr, size}; - mapped_regions.push_back(region); + const VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]}; + if (cpu_addr) { + return cpu_addr + (addr & page_mask); + } - return gpu_addr; + return {}; } -GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & PAGE_MASK) == 0); +template <typename T> +T MemoryManager::Read(GPUVAddr addr) const { + if (!IsAddressValid(addr)) { + return {}; + } - for (u64 offset{}; offset < size; offset += PAGE_SIZE) { - VAddr& slot{PageSlot(gpu_addr + offset)}; + const u8* page_pointer{page_table.pointers[addr >> page_bits]}; + if (page_pointer) { + // NOTE: Avoid adding any extra logic to this fast-path block + T value; + std::memcpy(&value, &page_pointer[addr & page_mask], sizeof(T)); + return value; + } - ASSERT(slot != static_cast<u64>(PageStatus::Allocated) && - slot != static_cast<u64>(PageStatus::Unmapped)); + switch (page_table.attributes[addr >> page_bits]) { + case 
Common::PageType::Unmapped: + LOG_ERROR(HW_GPU, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, addr); + return 0; + case Common::PageType::Memory: + ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); + break; + default: + UNREACHABLE(); + } + return {}; +} - slot = static_cast<u64>(PageStatus::Unmapped); +template <typename T> +void MemoryManager::Write(GPUVAddr addr, T data) { + if (!IsAddressValid(addr)) { + return; } - // Delete the region mappings that are contained within the unmapped region - mapped_regions.erase(std::remove_if(mapped_regions.begin(), mapped_regions.end(), - [&](const MappedRegion& region) { - return region.gpu_addr <= gpu_addr && - region.gpu_addr + region.size < gpu_addr + size; - }), - mapped_regions.end()); - return gpu_addr; + u8* page_pointer{page_table.pointers[addr >> page_bits]}; + if (page_pointer) { + // NOTE: Avoid adding any extra logic to this fast-path block + std::memcpy(&page_pointer[addr & page_mask], &data, sizeof(T)); + return; + } + + switch (page_table.attributes[addr >> page_bits]) { + case Common::PageType::Unmapped: + LOG_ERROR(HW_GPU, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8, + static_cast<u32>(data), addr); + return; + case Common::PageType::Memory: + ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); + break; + default: + UNREACHABLE(); + } } -GPUVAddr MemoryManager::GetRegionEnd(GPUVAddr region_start) const { - for (const auto& region : mapped_regions) { - const GPUVAddr region_end{region.gpu_addr + region.size}; - if (region_start >= region.gpu_addr && region_start < region_end) { - return region_end; - } +template u8 MemoryManager::Read<u8>(GPUVAddr addr) const; +template u16 MemoryManager::Read<u16>(GPUVAddr addr) const; +template u32 MemoryManager::Read<u32>(GPUVAddr addr) const; +template u64 MemoryManager::Read<u64>(GPUVAddr addr) const; +template void MemoryManager::Write<u8>(GPUVAddr addr, u8 data); +template void 
MemoryManager::Write<u16>(GPUVAddr addr, u16 data); +template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data); +template void MemoryManager::Write<u64>(GPUVAddr addr, u64 data); + +u8* MemoryManager::GetPointer(GPUVAddr addr) { + if (!IsAddressValid(addr)) { + return {}; } + + u8* const page_pointer{page_table.pointers[addr >> page_bits]}; + if (page_pointer != nullptr) { + return page_pointer + (addr & page_mask); + } + + LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); return {}; } -std::optional<GPUVAddr> MemoryManager::FindFreeBlock(GPUVAddr region_start, u64 size, u64 align, - PageStatus status) { - GPUVAddr gpu_addr{region_start}; - u64 free_space{}; - align = (align + PAGE_MASK) & ~PAGE_MASK; +const u8* MemoryManager::GetPointer(GPUVAddr addr) const { + if (!IsAddressValid(addr)) { + return {}; + } - while (gpu_addr + free_space < MAX_ADDRESS) { - if (PageSlot(gpu_addr + free_space) == static_cast<u64>(status)) { - free_space += PAGE_SIZE; - if (free_space >= size) { - return gpu_addr; - } - } else { - gpu_addr += free_space + PAGE_SIZE; - free_space = 0; - gpu_addr = Common::AlignUp(gpu_addr, align); - } + const u8* const page_pointer{page_table.pointers[addr >> page_bits]}; + if (page_pointer != nullptr) { + return page_pointer + (addr & page_mask); } + LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); return {}; } -std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) { - const VAddr base_addr{PageSlot(gpu_addr)}; +void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const { + std::size_t remaining_size{size}; + std::size_t page_index{src_addr >> page_bits}; + std::size_t page_offset{src_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + + switch (page_table.attributes[page_index]) { + case Common::PageType::Memory: { + const u8* 
src_ptr{page_table.pointers[page_index] + page_offset}; + rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); + std::memcpy(dest_buffer, src_ptr, copy_amount); + break; + } + default: + UNREACHABLE(); + } - if (base_addr == static_cast<u64>(PageStatus::Allocated) || - base_addr == static_cast<u64>(PageStatus::Unmapped) || - base_addr == static_cast<u64>(PageStatus::Reserved)) { - return {}; + page_index++; + page_offset = 0; + dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; + remaining_size -= copy_amount; } +} - return base_addr + (gpu_addr & PAGE_MASK); +void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { + std::size_t remaining_size{size}; + std::size_t page_index{dest_addr >> page_bits}; + std::size_t page_offset{dest_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + + switch (page_table.attributes[page_index]) { + case Common::PageType::Memory: { + u8* dest_ptr{page_table.pointers[page_index] + page_offset}; + rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); + std::memcpy(dest_ptr, src_buffer, copy_amount); + break; + } + default: + UNREACHABLE(); + } + + page_index++; + page_offset = 0; + src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; + remaining_size -= copy_amount; + } } -u8 MemoryManager::Read8(GPUVAddr addr) { - return Memory::Read8(*GpuToCpuAddress(addr)); +void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { + std::size_t remaining_size{size}; + std::size_t page_index{src_addr >> page_bits}; + std::size_t page_offset{src_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + + switch (page_table.attributes[page_index]) { + case Common::PageType::Memory: { + const u8* 
src_ptr{page_table.pointers[page_index] + page_offset}; + rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); + WriteBlock(dest_addr, src_ptr, copy_amount); + break; + } + default: + UNREACHABLE(); + } + + page_index++; + page_offset = 0; + dest_addr += static_cast<VAddr>(copy_amount); + src_addr += static_cast<VAddr>(copy_amount); + remaining_size -= copy_amount; + } } -u16 MemoryManager::Read16(GPUVAddr addr) { - return Memory::Read16(*GpuToCpuAddress(addr)); +void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, + VAddr backing_addr) { + LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, + (base + size) * page_size); + + const VAddr end{base + size}; + ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", + base + page_table.pointers.size()); + + std::fill(page_table.attributes.begin() + base, page_table.attributes.begin() + end, type); + + if (memory == nullptr) { + std::fill(page_table.pointers.begin() + base, page_table.pointers.begin() + end, memory); + std::fill(page_table.backing_addr.begin() + base, page_table.backing_addr.begin() + end, + backing_addr); + } else { + while (base != end) { + page_table.pointers[base] = memory; + page_table.backing_addr[base] = backing_addr; + + base += 1; + memory += page_size; + backing_addr += page_size; + } + } } -u32 MemoryManager::Read32(GPUVAddr addr) { - return Memory::Read32(*GpuToCpuAddress(addr)); +void MemoryManager::MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr) { + ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); + ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); + MapPages(base / page_size, size / page_size, target, Common::PageType::Memory, backing_addr); } -u64 MemoryManager::Read64(GPUVAddr addr) { - return Memory::Read64(*GpuToCpuAddress(addr)); +void MemoryManager::UnmapRegion(GPUVAddr base, u64 size) { + 
ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); + ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); + MapPages(base / page_size, size / page_size, nullptr, Common::PageType::Unmapped); } -void MemoryManager::Write8(GPUVAddr addr, u8 data) { - Memory::Write8(*GpuToCpuAddress(addr), data); +bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const { + ASSERT(base + size == next.base); + if (type != next.type) { + return {}; + } + if (type == VirtualMemoryArea::Type::Allocated && (offset + size != next.offset)) { + return {}; + } + if (type == VirtualMemoryArea::Type::Mapped && backing_memory + size != next.backing_memory) { + return {}; + } + return true; } -void MemoryManager::Write16(GPUVAddr addr, u16 data) { - Memory::Write16(*GpuToCpuAddress(addr), data); +MemoryManager::VMAHandle MemoryManager::FindVMA(GPUVAddr target) const { + if (target >= address_space_end) { + return vma_map.end(); + } else { + return std::prev(vma_map.upper_bound(target)); + } } -void MemoryManager::Write32(GPUVAddr addr, u32 data) { - Memory::Write32(*GpuToCpuAddress(addr), data); +MemoryManager::VMAIter MemoryManager::Allocate(VMAIter vma_handle) { + VirtualMemoryArea& vma{vma_handle->second}; + + vma.type = VirtualMemoryArea::Type::Allocated; + vma.backing_addr = 0; + vma.backing_memory = {}; + UpdatePageTableForVMA(vma); + + return MergeAdjacent(vma_handle); } -void MemoryManager::Write64(GPUVAddr addr, u64 data) { - Memory::Write64(*GpuToCpuAddress(addr), data); +MemoryManager::VMAHandle MemoryManager::AllocateMemory(GPUVAddr target, std::size_t offset, + u64 size) { + + // This is the appropriately sized VMA that will turn into our allocation. 
+ VMAIter vma_handle{CarveVMA(target, size)}; + VirtualMemoryArea& vma{vma_handle->second}; + + ASSERT(vma.size == size); + + vma.offset = offset; + + return Allocate(vma_handle); } -u8* MemoryManager::GetPointer(GPUVAddr addr) { - return Memory::GetPointer(*GpuToCpuAddress(addr)); +MemoryManager::VMAHandle MemoryManager::MapBackingMemory(GPUVAddr target, u8* memory, u64 size, + VAddr backing_addr) { + // This is the appropriately sized VMA that will turn into our allocation. + VMAIter vma_handle{CarveVMA(target, size)}; + VirtualMemoryArea& vma{vma_handle->second}; + + ASSERT(vma.size == size); + + vma.type = VirtualMemoryArea::Type::Mapped; + vma.backing_memory = memory; + vma.backing_addr = backing_addr; + UpdatePageTableForVMA(vma); + + return MergeAdjacent(vma_handle); } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) { - std::memcpy(dest_buffer, GetPointer(src_addr), size); +void MemoryManager::UnmapRange(GPUVAddr target, u64 size) { + VMAIter vma{CarveVMARange(target, size)}; + const VAddr target_end{target + size}; + const VMAIter end{vma_map.end()}; + + // The comparison against the end of the range must be done using addresses since VMAs can be + // merged during this process, causing invalidation of the iterators. + while (vma != end && vma->second.base < target_end) { + // Unmapped ranges return to allocated state and can be reused + // This behavior is used by Super Mario Odyssey, Sonic Forces, and likely other games + vma = std::next(Allocate(vma)); + } + + ASSERT(FindVMA(target)->second.size >= size); } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { - std::memcpy(GetPointer(dest_addr), src_buffer, size); + +MemoryManager::VMAIter MemoryManager::StripIterConstness(const VMAHandle& iter) { + // This uses a neat C++ trick to convert a const_iterator to a regular iterator, given + // non-const access to its container. 
+ return vma_map.erase(iter, iter); // Erases an empty range of elements } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { - std::memcpy(GetPointer(dest_addr), GetPointer(src_addr), size); +MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) { + ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); + ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: 0x{:016X}", base); + + VMAIter vma_handle{StripIterConstness(FindVMA(base))}; + if (vma_handle == vma_map.end()) { + // Target address is outside the managed range + return {}; + } + + const VirtualMemoryArea& vma{vma_handle->second}; + if (vma.type == VirtualMemoryArea::Type::Mapped) { + // Region is already allocated + return vma_handle; + } + + const VAddr start_in_vma{base - vma.base}; + const VAddr end_in_vma{start_in_vma + size}; + + ASSERT_MSG(end_in_vma <= vma.size, "region size 0x{:016X} is less than required size 0x{:016X}", + vma.size, end_in_vma); + + if (end_in_vma < vma.size) { + // Split VMA at the end of the allocated region + SplitVMA(vma_handle, end_in_vma); + } + if (start_in_vma != 0) { + // Split VMA at the start of the allocated region + vma_handle = SplitVMA(vma_handle, start_in_vma); + } + + return vma_handle; +} + +MemoryManager::VMAIter MemoryManager::CarveVMARange(GPUVAddr target, u64 size) { + ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); + ASSERT_MSG((target & page_mask) == 0, "non-page aligned base: 0x{:016X}", target); + + const VAddr target_end{target + size}; + ASSERT(target_end >= target); + ASSERT(size > 0); + + VMAIter begin_vma{StripIterConstness(FindVMA(target))}; + const VMAIter i_end{vma_map.lower_bound(target_end)}; + if (std::any_of(begin_vma, i_end, [](const auto& entry) { + return entry.second.type == VirtualMemoryArea::Type::Unmapped; + })) { + return {}; + } + + if (target != begin_vma->second.base) { + begin_vma = SplitVMA(begin_vma, target - 
begin_vma->second.base); + } + + VMAIter end_vma{StripIterConstness(FindVMA(target_end))}; + if (end_vma != vma_map.end() && target_end != end_vma->second.base) { + end_vma = SplitVMA(end_vma, target_end - end_vma->second.base); + } + + return begin_vma; +} + +MemoryManager::VMAIter MemoryManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) { + VirtualMemoryArea& old_vma{vma_handle->second}; + VirtualMemoryArea new_vma{old_vma}; // Make a copy of the VMA + + // For now, don't allow no-op VMA splits (trying to split at a boundary) because it's probably + // a bug. This restriction might be removed later. + ASSERT(offset_in_vma < old_vma.size); + ASSERT(offset_in_vma > 0); + + old_vma.size = offset_in_vma; + new_vma.base += offset_in_vma; + new_vma.size -= offset_in_vma; + + switch (new_vma.type) { + case VirtualMemoryArea::Type::Unmapped: + break; + case VirtualMemoryArea::Type::Allocated: + new_vma.offset += offset_in_vma; + break; + case VirtualMemoryArea::Type::Mapped: + new_vma.backing_memory += offset_in_vma; + break; + } + + ASSERT(old_vma.CanBeMergedWith(new_vma)); + + return vma_map.emplace_hint(std::next(vma_handle), new_vma.base, new_vma); +} + +MemoryManager::VMAIter MemoryManager::MergeAdjacent(VMAIter iter) { + const VMAIter next_vma{std::next(iter)}; + if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) { + iter->second.size += next_vma->second.size; + vma_map.erase(next_vma); + } + + if (iter != vma_map.begin()) { + VMAIter prev_vma{std::prev(iter)}; + if (prev_vma->second.CanBeMergedWith(iter->second)) { + prev_vma->second.size += iter->second.size; + vma_map.erase(iter); + iter = prev_vma; + } + } + + return iter; } -VAddr& MemoryManager::PageSlot(GPUVAddr gpu_addr) { - auto& block{page_table[(gpu_addr >> (PAGE_BITS + PAGE_TABLE_BITS)) & PAGE_TABLE_MASK]}; - if (!block) { - block = std::make_unique<PageBlock>(); - block->fill(static_cast<VAddr>(PageStatus::Unmapped)); +void MemoryManager::UpdatePageTableForVMA(const 
VirtualMemoryArea& vma) { + switch (vma.type) { + case VirtualMemoryArea::Type::Unmapped: + UnmapRegion(vma.base, vma.size); + break; + case VirtualMemoryArea::Type::Allocated: + MapMemoryRegion(vma.base, vma.size, nullptr, vma.backing_addr); + break; + case VirtualMemoryArea::Type::Mapped: + MapMemoryRegion(vma.base, vma.size, vma.backing_memory, vma.backing_addr); + break; } - return (*block)[(gpu_addr >> PAGE_BITS) & PAGE_BLOCK_MASK]; } } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 425e2f31c..647cbf93a 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -1,82 +1,154 @@ -// Copyright 2018 yuzu emulator team +// Copyright 2018 yuzu emulator team // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once -#include <array> -#include <memory> +#include <map> #include <optional> -#include <vector> #include "common/common_types.h" +#include "common/page_table.h" + +namespace VideoCore { +class RasterizerInterface; +} namespace Tegra { -/// Virtual addresses in the GPU's memory map are 64 bit. -using GPUVAddr = u64; +/** + * Represents a VMA in an address space. A VMA is a contiguous region of virtual addressing space + * with homogeneous attributes across its extents. In this particular implementation each VMA is + * also backed by a single host memory allocation. + */ +struct VirtualMemoryArea { + enum class Type : u8 { + Unmapped, + Allocated, + Mapped, + }; + + /// Virtual base address of the region. + GPUVAddr base{}; + /// Size of the region. + u64 size{}; + /// Memory area mapping type. + Type type{Type::Unmapped}; + /// CPU memory mapped address corresponding to this memory area. + VAddr backing_addr{}; + /// Offset into the backing_memory the mapping starts from. + std::size_t offset{}; + /// Pointer backing this VMA. + u8* backing_memory{}; + + /// Tests if this area can be merged to the right with `next`. 
+ bool CanBeMergedWith(const VirtualMemoryArea& next) const; +}; class MemoryManager final { public: - MemoryManager(); + MemoryManager(VideoCore::RasterizerInterface& rasterizer); GPUVAddr AllocateSpace(u64 size, u64 align); - GPUVAddr AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align); + GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align); GPUVAddr MapBufferEx(VAddr cpu_addr, u64 size); - GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size); - GPUVAddr UnmapBuffer(GPUVAddr gpu_addr, u64 size); - GPUVAddr GetRegionEnd(GPUVAddr region_start) const; - std::optional<VAddr> GpuToCpuAddress(GPUVAddr gpu_addr); - - static constexpr u64 PAGE_BITS = 16; - static constexpr u64 PAGE_SIZE = 1 << PAGE_BITS; - static constexpr u64 PAGE_MASK = PAGE_SIZE - 1; + GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr addr, u64 size); + GPUVAddr UnmapBuffer(GPUVAddr addr, u64 size); + std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; - u8 Read8(GPUVAddr addr); - u16 Read16(GPUVAddr addr); - u32 Read32(GPUVAddr addr); - u64 Read64(GPUVAddr addr); + template <typename T> + T Read(GPUVAddr addr) const; - void Write8(GPUVAddr addr, u8 data); - void Write16(GPUVAddr addr, u16 data); - void Write32(GPUVAddr addr, u32 data); - void Write64(GPUVAddr addr, u64 data); + template <typename T> + void Write(GPUVAddr addr, T data); - u8* GetPointer(GPUVAddr vaddr); + u8* GetPointer(GPUVAddr addr); + const u8* GetPointer(GPUVAddr addr) const; - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size); + void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(VAddr dest_addr, VAddr src_addr, std::size_t size); + void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); private: - enum class PageStatus : u64 { - Unmapped = 0xFFFFFFFFFFFFFFFFULL, - Allocated = 0xFFFFFFFFFFFFFFFEULL, - Reserved = 0xFFFFFFFFFFFFFFFDULL, - }; - - 
std::optional<GPUVAddr> FindFreeBlock(GPUVAddr region_start, u64 size, u64 align, - PageStatus status); - VAddr& PageSlot(GPUVAddr gpu_addr); - - static constexpr u64 MAX_ADDRESS{0x10000000000ULL}; - static constexpr u64 PAGE_TABLE_BITS{10}; - static constexpr u64 PAGE_TABLE_SIZE{1 << PAGE_TABLE_BITS}; - static constexpr u64 PAGE_TABLE_MASK{PAGE_TABLE_SIZE - 1}; - static constexpr u64 PAGE_BLOCK_BITS{14}; - static constexpr u64 PAGE_BLOCK_SIZE{1 << PAGE_BLOCK_BITS}; - static constexpr u64 PAGE_BLOCK_MASK{PAGE_BLOCK_SIZE - 1}; - - using PageBlock = std::array<VAddr, PAGE_BLOCK_SIZE>; - std::array<std::unique_ptr<PageBlock>, PAGE_TABLE_SIZE> page_table{}; - - struct MappedRegion { - VAddr cpu_addr; - GPUVAddr gpu_addr; - u64 size; - }; + using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; + using VMAHandle = VMAMap::const_iterator; + using VMAIter = VMAMap::iterator; + + bool IsAddressValid(GPUVAddr addr) const; + void MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, + VAddr backing_addr = 0); + void MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr); + void UnmapRegion(GPUVAddr base, u64 size); + + /// Finds the VMA in which the given address is included, or `vma_map.end()`. + VMAHandle FindVMA(GPUVAddr target) const; + + VMAHandle AllocateMemory(GPUVAddr target, std::size_t offset, u64 size); + + /** + * Maps an unmanaged host memory pointer at a given address. + * + * @param target The guest address to start the mapping at. + * @param memory The memory to be mapped. + * @param size Size of the mapping. + * @param backing_addr CPU memory mapped address corresponding to the mapping. + */ + VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr); + + /// Unmaps a range of addresses, splitting VMAs as necessary. + void UnmapRange(GPUVAddr target, u64 size); + + /// Converts a VMAHandle to a mutable VMAIter. + VMAIter StripIterConstness(const VMAHandle& iter); + + /// Marks the specified VMA as allocated.
+ VMAIter Allocate(VMAIter vma); + + /** + * Carves a VMA of a specific size at the specified address by splitting Free VMAs while doing + * the appropriate error checking. + */ + VMAIter CarveVMA(GPUVAddr base, u64 size); + + /** + * Splits the edges of the given range of non-Free VMAs so that there is a VMA split at each + * end of the range. + */ + VMAIter CarveVMARange(GPUVAddr base, u64 size); + + /** + * Splits a VMA in two, at the specified offset. + * @returns the right side of the split, with the original iterator becoming the left side. + */ + VMAIter SplitVMA(VMAIter vma, u64 offset_in_vma); + + /** + * Checks for and merges the specified VMA with adjacent ones if possible. + * @returns the merged VMA or the original if no merging was possible. + */ + VMAIter MergeAdjacent(VMAIter vma); + + /// Updates the pages corresponding to this VMA so they match the VMA's attributes. + void UpdatePageTableForVMA(const VirtualMemoryArea& vma); + + /// Finds a free (unmapped) region of the specified size starting at the specified address. + GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size) const; - std::vector<MappedRegion> mapped_regions; +private: + static constexpr u64 page_bits{16}; + static constexpr u64 page_size{1 << page_bits}; + static constexpr u64 page_mask{page_size - 1}; + + /// Address space in bits; this is fairly arbitrary but sufficiently large. + static constexpr u32 address_space_width{39}; + /// Start address for mapping; this is fairly arbitrary but must be non-zero. + static constexpr GPUVAddr address_space_base{0x100000}; + /// End of address space, based on address space in bits.
+ static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; + + Common::PageTable page_table{page_bits}; + VMAMap vma_map; + VideoCore::RasterizerInterface& rasterizer; }; } // namespace Tegra diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h index ecd9986a0..291772186 100644 --- a/src/video_core/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache.h @@ -71,8 +71,8 @@ private: bool is_registered{}; ///< Whether the object is currently registered with the cache bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - CacheAddr cache_addr{}; ///< Cache address memory, unique from emulated virtual address space const u8* host_ptr{}; ///< Pointer to the memory backing this cached region + CacheAddr cache_addr{}; ///< Cache address memory, unique from emulated virtual address space }; template <class T> @@ -84,7 +84,7 @@ public: /// Write any cached resources overlapping the specified region back to memory void FlushRegion(CacheAddr addr, std::size_t size) { - std::lock_guard<std::recursive_mutex> lock{mutex}; + std::lock_guard lock{mutex}; const auto& objects{GetSortedObjectsFromRegion(addr, size)}; for (auto& object : objects) { @@ -94,7 +94,7 @@ public: /// Mark the specified region as being invalidated void InvalidateRegion(CacheAddr addr, u64 size) { - std::lock_guard<std::recursive_mutex> lock{mutex}; + std::lock_guard lock{mutex}; const auto& objects{GetSortedObjectsFromRegion(addr, size)}; for (auto& object : objects) { @@ -108,7 +108,7 @@ public: /// Invalidates everything in the cache void InvalidateAll() { - std::lock_guard<std::recursive_mutex> lock{mutex}; + std::lock_guard lock{mutex}; while (interval_cache.begin() != interval_cache.end()) { Unregister(*interval_cache.begin()->second.begin()); @@ -132,8 +132,8 @@ protected: } /// Register an object into the cache - void Register(const 
T& object) { - std::lock_guard<std::recursive_mutex> lock{mutex}; + virtual void Register(const T& object) { + std::lock_guard lock{mutex}; object->SetIsRegistered(true); interval_cache.add({GetInterval(object), ObjectSet{object}}); @@ -142,8 +142,8 @@ protected: } /// Unregisters an object from the cache - void Unregister(const T& object) { - std::lock_guard<std::recursive_mutex> lock{mutex}; + virtual void Unregister(const T& object) { + std::lock_guard lock{mutex}; object->SetIsRegistered(false); rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); @@ -153,14 +153,14 @@ protected: /// Returns a ticks counter used for tracking when cached objects were last modified u64 GetModifiedTicks() { - std::lock_guard<std::recursive_mutex> lock{mutex}; + std::lock_guard lock{mutex}; return ++modified_ticks; } /// Flushes the specified object, updating appropriate cache state as needed void FlushObject(const T& object) { - std::lock_guard<std::recursive_mutex> lock{mutex}; + std::lock_guard lock{mutex}; if (!object->IsDirty()) { return; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 76e292e87..d7b86df38 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -9,7 +9,6 @@ #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" namespace VideoCore { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 5048ed6ce..25652e794 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -7,7 +7,7 @@ #include "common/alignment.h" #include "core/core.h" -#include "core/memory.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -15,14 +15,14 @@ 
namespace OpenGL { CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, std::size_t alignment, u8* host_ptr) - : cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{ - host_ptr} {} + : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset}, + alignment{alignment} {} OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) : RasterizerCache{rasterizer}, stream_buffer(size, true) {} -GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, - std::size_t alignment, bool cache) { +GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, + bool cache) { auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); // Cache management is a big overhead, so only cache entries with a given size. diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 1de1f84ae..fc33aa433 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -58,7 +58,7 @@ public: /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been /// allocated. - GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, + GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, bool cache = true); /// Uploads from a host memory. Returns host's buffer offset where it's been allocated. 
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp index c8dbcacbd..8d9ee81f1 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ b/src/video_core/renderer_opengl/gl_global_cache.cpp @@ -4,9 +4,9 @@ #include <glad/glad.h> -#include "common/assert.h" #include "common/logging/log.h" #include "core/core.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" @@ -15,7 +15,7 @@ namespace OpenGL { CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr) - : cpu_addr{cpu_addr}, size{size}, RasterizerCacheObject{host_ptr} { + : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} { buffer.Create(); // Bind and unbind the buffer so it gets allocated by the driver glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); @@ -46,7 +46,7 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, return search->second; } -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(Tegra::GPUVAddr addr, u32 size, +GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr) { GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; if (!region) { @@ -76,8 +76,8 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]}; const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + global_region.GetCbufOffset()}; - const auto actual_addr{memory_manager.Read64(addr)}; - const auto size{memory_manager.Read32(addr + 8)}; + const auto actual_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; // Look up global region in the cache based on address const auto& 
host_ptr{memory_manager.GetPointer(actual_addr)}; diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h index a840491f7..5a21ab66f 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ b/src/video_core/renderer_opengl/gl_global_cache.h @@ -66,7 +66,7 @@ public: private: GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; - GlobalRegion GetUncachedGlobalRegion(Tegra::GPUVAddr addr, u32 size, u8* host_ptr); + GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr); void ReserveGlobalRegion(GlobalRegion region); std::unordered_map<CacheAddr, GlobalRegion> reserve; diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp index 75d816795..c3e94d917 100644 --- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp +++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp @@ -7,7 +7,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/core.h" -#include "core/memory.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_primitive_assembler.h" @@ -40,8 +40,7 @@ GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) { return index_offset; } -GLintptr PrimitiveAssembler::MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size, - u32 count) { +GLintptr PrimitiveAssembler::MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count) { const std::size_t map_size{CalculateQuadSize(count)}; auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size); diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h index a8cb88eb5..4e87ce4d6 100644 --- a/src/video_core/renderer_opengl/gl_primitive_assembler.h +++ b/src/video_core/renderer_opengl/gl_primitive_assembler.h @@ -4,11 
+4,9 @@ #pragma once -#include <vector> #include <glad/glad.h> #include "common/common_types.h" -#include "video_core/memory_manager.h" namespace OpenGL { @@ -24,7 +22,7 @@ public: GLintptr MakeQuadArray(u32 first, u32 count); - GLintptr MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size, u32 count); + GLintptr MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count); private: OGLBufferCache& buffer_cache; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 198c54872..d250d5cbb 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -17,7 +17,6 @@ #include "common/microprofile.h" #include "common/scope_exit.h" #include "core/core.h" -#include "core/frontend/emu_window.h" #include "core/hle/kernel/process.h" #include "core/settings.h" #include "video_core/engines/maxwell_3d.h" @@ -26,7 +25,6 @@ #include "video_core/renderer_opengl/gl_shader_gen.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" -#include "video_core/video_core.h" namespace OpenGL { @@ -100,11 +98,9 @@ struct FramebufferCacheKey { } }; -RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system, - ScreenInfo& info) - : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, - emu_window{window}, system{system}, screen_info{info}, - buffer_cache(*this, STREAM_BUFFER_SIZE) { +RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) + : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system}, + screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) { // Create sampler objects for (std::size_t i = 0; i < texture_samplers.size(); ++i) { texture_samplers[i].Create(); @@ -225,8 +221,8 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { if (!vertex_array.IsEnabled()) continue; - const 
Tegra::GPUVAddr start = vertex_array.StartAddress(); - const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); + const GPUVAddr start = vertex_array.StartAddress(); + const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); ASSERT(end > start); const u64 size = end - start + 1; @@ -303,6 +299,10 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { BaseBindings base_bindings; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + // Prepare packed bindings + bind_ubo_pushbuffer.Setup(base_bindings.cbuf); + bind_ssbo_pushbuffer.Setup(base_bindings.gmem); + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -320,13 +320,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5 GLShader::MaxwellUniformData ubo{}; - ubo.SetFromRegs(gpu.state.shader_stages[stage]); + ubo.SetFromRegs(gpu, stage); const GLintptr offset = buffer_cache.UploadHostMemory( &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); // Bind the emulation info buffer - glBindBufferRange(GL_UNIFORM_BUFFER, base_bindings.cbuf, buffer_cache.GetHandle(), offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, + static_cast<GLsizeiptr>(sizeof(ubo))); Shader shader{shader_cache.GetStageProgram(program)}; const auto [program_handle, next_bindings] = @@ -370,6 +370,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { base_bindings = next_bindings; } + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + SyncClipEnabled(clip_distances); gpu.dirty_flags.shaders = false; @@ -421,8 +424,8 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { if (!regs.vertex_array[index].IsEnabled()) continue; - const 
Tegra::GPUVAddr start = regs.vertex_array[index].StartAddress(); - const Tegra::GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); + const GPUVAddr start = regs.vertex_array[index].StartAddress(); + const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); ASSERT(end > start); size += end - start + 1; @@ -904,23 +907,14 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)]; const auto& entries = shader->GetShaderEntries().const_buffers; - constexpr u64 max_binds = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers; - std::array<GLuint, max_binds> bind_buffers; - std::array<GLintptr, max_binds> bind_offsets; - std::array<GLsizeiptr, max_binds> bind_sizes; - - ASSERT_MSG(entries.size() <= max_binds, "Exceeded expected number of binding points."); - // Upload only the enabled buffers from the 16 constbuffers of each shader stage for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& used_buffer = entries[bindpoint]; const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()]; if (!buffer.enabled) { - // With disabled buffers set values as zero to unbind them - bind_buffers[bindpoint] = 0; - bind_offsets[bindpoint] = 0; - bind_sizes[bindpoint] = 0; + // Set values to zero to unbind buffers + bind_ubo_pushbuffer.Push(0, 0, 0); continue; } @@ -948,30 +942,19 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader const GLintptr const_buffer_offset = buffer_cache.UploadMemory( buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); - // Prepare values for multibind - bind_buffers[bindpoint] = buffer_cache.GetHandle(); - bind_offsets[bindpoint] = const_buffer_offset; - bind_sizes[bindpoint] = size; + bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); } - - // The first binding is reserved for emulation values - const GLuint 
ubo_base_binding = base_bindings.cbuf + 1; - glBindBuffersRange(GL_UNIFORM_BUFFER, ubo_base_binding, static_cast<GLsizei>(entries.size()), - bind_buffers.data(), bind_offsets.data(), bind_sizes.data()); } void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader, GLenum primitive_mode, BaseBindings base_bindings) { - // TODO(Rodrigo): Use ARB_multi_bind here const auto& entries = shader->GetShaderEntries().global_memory_entries; - - for (u32 bindpoint = 0; bindpoint < static_cast<u32>(entries.size()); ++bindpoint) { - const auto& entry = entries[bindpoint]; - const u32 current_bindpoint = base_bindings.gmem + bindpoint; - const auto& region = global_cache.GetGlobalRegion(entry, stage); - - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, current_bindpoint, region->GetBufferHandle()); + for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { + const auto& entry{entries[bindpoint]}; + const auto& region{global_cache.GetGlobalRegion(entry, stage)}; + bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, + static_cast<GLsizeiptr>(region->GetSizeInBytes())); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 30f3e8acb..e4c64ae71 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -12,15 +12,12 @@ #include <optional> #include <tuple> #include <utility> -#include <vector> #include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range.hpp> #include <glad/glad.h> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" #include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" @@ -29,10 +26,9 @@ #include "video_core/renderer_opengl/gl_rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include 
"video_core/renderer_opengl/gl_shader_cache.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" +#include "video_core/renderer_opengl/utils.h" namespace Core { class System; @@ -50,8 +46,7 @@ struct FramebufferCacheKey; class RasterizerOpenGL : public VideoCore::RasterizerInterface { public: - explicit RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system, - ScreenInfo& info); + explicit RasterizerOpenGL(Core::System& system, ScreenInfo& info); ~RasterizerOpenGL() override; void DrawArrays() override; @@ -214,7 +209,6 @@ private: ShaderCacheOpenGL shader_cache; GlobalRegionCacheOpenGL global_cache; - Core::Frontend::EmuWindow& emu_window; Core::System& system; ScreenInfo& screen_info; @@ -236,6 +230,9 @@ private: PrimitiveAssembler primitive_assembler{buffer_cache}; GLint uniform_buffer_alignment; + BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; + BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + std::size_t CalculateVertexArraysSize() const; std::size_t CalculateIndexBufferSize() const; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 57329cd61..9026a9452 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -13,9 +13,9 @@ #include "common/scope_exit.h" #include "core/core.h" #include "core/hle/kernel/process.h" -#include "core/memory.h" #include "core/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" @@ -55,7 +55,7 @@ static void ApplyTextureDefaults(GLuint texture, u32 max_mip_level) { 
} } -void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) { +void SurfaceParams::InitCacheParameters(GPUVAddr gpu_addr_) { auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; gpu_addr = gpu_addr_; @@ -222,7 +222,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only, } /*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer( - u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format, + u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format, u32 block_width, u32 block_height, u32 block_depth, Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) { SurfaceParams params{}; @@ -266,6 +266,10 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only, params.component_type = ComponentTypeFromRenderTarget(config.format); params.type = GetFormatType(params.pixel_format); params.width = config.width; + if (!params.is_tiled) { + const u32 bpp = params.GetFormatBpp() / 8; + params.pitch = config.width * bpp; + } params.height = config.height; params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; @@ -562,8 +566,14 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac } CachedSurface::CachedSurface(const SurfaceParams& params) - : params{params}, gl_target{SurfaceTargetToGL(params.target)}, - cached_size_in_bytes{params.size_in_bytes}, RasterizerCacheObject{params.host_ptr} { + : RasterizerCacheObject{params.host_ptr}, params{params}, + gl_target{SurfaceTargetToGL(params.target)}, cached_size_in_bytes{params.size_in_bytes} { + + const auto optional_cpu_addr{ + Core::System::GetInstance().GPU().MemoryManager().GpuToCpuAddress(params.gpu_addr)}; + ASSERT_MSG(optional_cpu_addr, "optional_cpu_addr is invalid"); + cpu_addr = *optional_cpu_addr; + texture.Create(gl_target); // TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0) @@ -603,20 +613,6 @@ 
CachedSurface::CachedSurface(const SurfaceParams& params) ApplyTextureDefaults(texture.handle, params.max_mip_level); OpenGL::LabelGLObject(GL_TEXTURE, texture.handle, params.gpu_addr, params.IdentityString()); - - // Clamp size to mapped GPU memory region - // TODO(bunnei): Super Mario Odyssey maps a 0x40000 byte region and then uses it for a 0x80000 - // R32F render buffer. We do not yet know if this is a game bug or something else, but this - // check is necessary to prevent flushing from overwriting unmapped memory. - - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const u64 max_size{memory_manager.GetRegionEnd(params.gpu_addr) - params.gpu_addr}; - if (cached_size_in_bytes > max_size) { - LOG_ERROR(HW_GPU, "Surface size {} exceeds region size {}", params.size_in_bytes, max_size); - cached_size_in_bytes = max_size; - } - - cpu_addr = *memory_manager.GpuToCpuAddress(params.gpu_addr); } MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64)); @@ -670,8 +666,8 @@ void CachedSurface::FlushGLBuffer() { gl_buffer[0].resize(GetSizeInBytes()); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - // Ensure no bad interactions with GL_UNPACK_ALIGNMENT - ASSERT(params.width * GetBytesPerPixel(params.pixel_format) % 4 == 0); + const u32 align = std::clamp(params.RowAlign(0), 1U, 8U); + glPixelStorei(GL_PACK_ALIGNMENT, align); glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width)); ASSERT(!tuple.compressed); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -716,8 +712,8 @@ void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); - // Ensure no bad interactions with GL_UNPACK_ALIGNMENT - ASSERT(params.MipWidth(mip_map) * GetBytesPerPixel(params.pixel_format) % 4 == 0); + const u32 align = std::clamp(params.RowAlign(mip_map), 1U, 8U); + 
glPixelStorei(GL_UNPACK_ALIGNMENT, align); glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.MipWidth(mip_map))); const auto image_size = static_cast<GLsizei>(params.GetMipmapSizeGL(mip_map, false)); @@ -925,7 +921,7 @@ void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { } Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) { - if (params.gpu_addr == 0 || params.height * params.width == 0) { + if (!params.IsValid()) { return {}; } @@ -941,7 +937,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres // If surface parameters changed and we care about keeping the previous data, recreate // the surface from the old one Surface new_surface{RecreateSurface(surface, params)}; - UnregisterSurface(surface); + Unregister(surface); Register(new_surface); if (new_surface->IsUploaded()) { RegisterReinterpretSurface(new_surface); @@ -949,7 +945,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres return new_surface; } else { // Delete the old surface before creating a new one to prevent collisions. 
- UnregisterSurface(surface); + Unregister(surface); } } @@ -980,11 +976,11 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface, const auto& init_params{src_surface->GetSurfaceParams()}; const auto& dst_params{dst_surface->GetSurfaceParams()}; auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - Tegra::GPUVAddr address{init_params.gpu_addr}; + GPUVAddr address{init_params.gpu_addr}; const std::size_t layer_size{dst_params.LayerMemorySize()}; for (u32 layer = 0; layer < dst_params.depth; layer++) { for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) { - const Tegra::GPUVAddr sub_address{address + dst_params.GetMipmapLevelOffset(mipmap)}; + const GPUVAddr sub_address{address + dst_params.GetMipmapLevelOffset(mipmap)}; const Surface& copy{TryGet(memory_manager.GetPointer(sub_address))}; if (!copy) { continue; @@ -1244,10 +1240,9 @@ static std::optional<u32> TryFindBestMipMap(std::size_t memory, const SurfacePar return {}; } -static std::optional<u32> TryFindBestLayer(Tegra::GPUVAddr addr, const SurfaceParams params, - u32 mipmap) { +static std::optional<u32> TryFindBestLayer(GPUVAddr addr, const SurfaceParams params, u32 mipmap) { const std::size_t size{params.LayerMemorySize()}; - Tegra::GPUVAddr start{params.gpu_addr + params.GetMipmapLevelOffset(mipmap)}; + GPUVAddr start{params.gpu_addr + params.GetMipmapLevelOffset(mipmap)}; for (u32 i = 0; i < params.depth; i++) { if (start == addr) { return {i}; @@ -1304,12 +1299,12 @@ static bool IsReinterpretInvalidSecond(const Surface render_surface, bool RasterizerCacheOpenGL::PartialReinterpretSurface(Surface triggering_surface, Surface intersect) { if (IsReinterpretInvalid(triggering_surface, intersect)) { - UnregisterSurface(intersect); + Unregister(intersect); return false; } if (!LayerFitReinterpretSurface(*this, triggering_surface, intersect)) { if (IsReinterpretInvalidSecond(triggering_surface, intersect)) { - UnregisterSurface(intersect); + 
Unregister(intersect); return false; } FlushObject(intersect); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 9366f47f2..db280dbb3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -5,13 +5,13 @@ #pragma once #include <array> -#include <map> #include <memory> #include <string> -#include <unordered_set> +#include <tuple> #include <vector> #include "common/alignment.h" +#include "common/bit_util.h" #include "common/common_types.h" #include "common/hash.h" #include "common/math_util.h" @@ -109,6 +109,11 @@ struct SurfaceParams { return size; } + /// Returns true if the parameters constitute a valid rasterizer surface. + bool IsValid() const { + return gpu_addr && host_ptr && height && width; + } + /// Returns the exact size of the memory occupied by a layer in a texture in VRAM, including /// mipmaps. std::size_t LayerMemorySize() const { @@ -201,6 +206,13 @@ struct SurfaceParams { return bd; } + u32 RowAlign(u32 mip_level) const { + const u32 m_width = MipWidth(mip_level); + const u32 bytes_per_pixel = GetBytesPerPixel(pixel_format); + const u32 l2 = Common::CountTrailingZeroes32(m_width * bytes_per_pixel); + return (1U << l2); + } + /// Creates SurfaceParams from a texture configuration static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config, const GLShader::SamplerEntry& entry); @@ -210,7 +222,7 @@ struct SurfaceParams { /// Creates SurfaceParams for a depth buffer configuration static SurfaceParams CreateForDepthBuffer( - u32 zeta_width, u32 zeta_height, Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format, + u32 zeta_width, u32 zeta_height, GPUVAddr zeta_address, Tegra::DepthFormat format, u32 block_width, u32 block_height, u32 block_depth, Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type); @@ -232,7 +244,7 @@ struct SurfaceParams { } /// Initializes parameters for 
caching, should be called after everything has been initialized - void InitCacheParameters(Tegra::GPUVAddr gpu_addr); + void InitCacheParameters(GPUVAddr gpu_addr); std::string TargetName() const { switch (target) { @@ -297,7 +309,7 @@ struct SurfaceParams { bool srgb_conversion; // Parameters used for caching u8* host_ptr; - Tegra::GPUVAddr gpu_addr; + GPUVAddr gpu_addr; std::size_t size_in_bytes; std::size_t size_in_bytes_gl; @@ -533,13 +545,17 @@ private: return nullptr; } + void Register(const Surface& object) override { + RasterizerCache<Surface>::Register(object); + } + /// Unregisters an object from the cache - void UnregisterSurface(const Surface& object) { + void Unregister(const Surface& object) override { if (object->IsReinterpreted()) { auto interval = GetReinterpretInterval(object); reinterpreted_surfaces.erase(interval); } - Unregister(object); + RasterizerCache<Surface>::Unregister(object); } }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 1ed740877..99f67494c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -6,13 +6,12 @@ #include "common/assert.h" #include "common/hash.h" #include "core/core.h" -#include "core/memory.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" -#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/utils.h" #include "video_core/shader/shader_ir.h" @@ -32,7 +31,7 @@ struct UnspecializedShader { namespace { /// Gets the address for the specified shader stage program -Tegra::GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { +GPUVAddr 
GetShaderAddress(Maxwell::ShaderProgram program) { const auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()}; const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; return gpu.regs.code_address.CodeAddress() + shader_config.offset; @@ -41,6 +40,10 @@ Tegra::GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { /// Gets the shader program code from memory for the specified address ProgramCode GetShaderCode(const u8* host_ptr) { ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); + ASSERT_OR_EXECUTE(host_ptr != nullptr, { + std::fill(program_code.begin(), program_code.end(), 0); + return program_code; + }); std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64)); return program_code; } @@ -215,9 +218,9 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) - : host_ptr{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, - program_type{program_type}, disk_cache{disk_cache}, - precompiled_programs{precompiled_programs}, RasterizerCacheObject{host_ptr} { + : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, + unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, + precompiled_programs{precompiled_programs} { const std::size_t code_size = CalculateProgramSize(program_code); const std::size_t code_size_b = @@ -245,9 +248,9 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, GLShader::ProgramResult result, u8* host_ptr) - : cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, - disk_cache{disk_cache}, precompiled_programs{precompiled_programs}, 
RasterizerCacheObject{ - host_ptr} { + : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, + program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ + precompiled_programs} { code = std::move(result.first); entries = result.second; @@ -486,7 +489,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { } auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const Tegra::GPUVAddr program_addr{GetShaderAddress(program)}; + const GPUVAddr program_addr{GetShaderAddress(program)}; // Look up shader in the cache based on address const auto& host_ptr{memory_manager.GetPointer(program_addr)}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index fd1c85115..0cf8e0b3d 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -5,21 +5,20 @@ #pragma once #include <array> +#include <atomic> #include <memory> #include <set> #include <tuple> #include <unordered_map> +#include <vector> #include <glad/glad.h> -#include "common/assert.h" #include "common/common_types.h" #include "video_core/rasterizer_cache.h" -#include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" namespace Core { class System; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 11d1169f0..28e490b3c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -21,6 +21,8 @@ namespace OpenGL::GLShader { +namespace { + using Tegra::Shader::Attribute; using Tegra::Shader::AttributeUse; using Tegra::Shader::Header; @@ -34,14 
+36,18 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; +enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; + +struct TextureAoffi {}; +using TextureArgument = std::pair<Type, Node>; +using TextureIR = std::variant<TextureAoffi, TextureArgument>; + enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); constexpr u32 MAX_GLOBALMEMORY_ELEMENTS = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float); -enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; - class ShaderWriter { public: void AddExpression(std::string_view text) { @@ -69,10 +75,10 @@ public: shader_source += '\n'; } - std::string GenerateTemporal() { - std::string temporal = "tmp"; - temporal += std::to_string(temporal_index++); - return temporal; + std::string GenerateTemporary() { + std::string temporary = "tmp"; + temporary += std::to_string(temporary_index++); + return temporary; } std::string GetResult() { @@ -87,11 +93,11 @@ private: } std::string shader_source; - u32 temporal_index = 1; + u32 temporary_index = 1; }; /// Generates code to use for a swizzle operation. 
-static std::string GetSwizzle(u32 elem) { +std::string GetSwizzle(u32 elem) { ASSERT(elem <= 3); std::string swizzle = "."; swizzle += "xyzw"[elem]; @@ -99,7 +105,7 @@ static std::string GetSwizzle(u32 elem) { } /// Translate topology -static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { +std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { switch (topology) { case Tegra::Shader::OutputTopology::PointList: return "points"; @@ -114,7 +120,7 @@ static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { } /// Returns true if an object has to be treated as precise -static bool IsPrecise(Operation operand) { +bool IsPrecise(Operation operand) { const auto& meta = operand.GetMeta(); if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) { @@ -126,7 +132,7 @@ static bool IsPrecise(Operation operand) { return false; } -static bool IsPrecise(Node node) { +bool IsPrecise(Node node) { if (const auto operation = std::get_if<OperationNode>(node)) { return IsPrecise(*operation); } @@ -426,9 +432,14 @@ private: std::string Visit(Node node) { if (const auto operation = std::get_if<OperationNode>(node)) { const auto operation_index = static_cast<std::size_t>(operation->GetCode()); + if (operation_index >= operation_decompilers.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", operation_index); + return {}; + } const auto decompiler = operation_decompilers[operation_index]; if (decompiler == nullptr) { - UNREACHABLE_MSG("Operation decompiler {} not defined", operation_index); + UNREACHABLE_MSG("Undefined operation: {}", operation_index); + return {}; } return (this->*decompiler)(*operation); @@ -540,9 +551,8 @@ private: } else if (std::holds_alternative<OperationNode>(*offset)) { // Indirect access - const std::string final_offset = code.GenerateTemporal(); - code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4) & " + - std::to_string(MAX_CONSTBUFFER_ELEMENTS - 1) + ';'); + const 
std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4);"); return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()), final_offset, final_offset); @@ -587,9 +597,9 @@ private: // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders const std::string precise = stage != ShaderStage::Fragment ? "precise " : ""; - const std::string temporal = code.GenerateTemporal(); - code.AddLine(precise + "float " + temporal + " = " + value + ';'); - return temporal; + const std::string temporary = code.GenerateTemporary(); + code.AddLine(precise + "float " + temporary + " = " + value + ';'); + return temporary; } std::string VisitOperand(Operation operation, std::size_t operand_index) { @@ -601,9 +611,9 @@ private: return Visit(operand); } - const std::string temporal = code.GenerateTemporal(); - code.AddLine("float " + temporal + " = " + Visit(operand) + ';'); - return temporal; + const std::string temporary = code.GenerateTemporary(); + code.AddLine("float " + temporary + " = " + Visit(operand) + ';'); + return temporary; } std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { @@ -718,8 +728,8 @@ private: result_type)); } - std::string GenerateTexture(Operation operation, const std::string& func, - const std::vector<std::pair<Type, Node>>& extras) { + std::string GenerateTexture(Operation operation, const std::string& function_suffix, + const std::vector<TextureIR>& extras) { constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); @@ -729,11 +739,11 @@ private: const bool has_array = meta->sampler.IsArray(); const bool has_shadow = meta->sampler.IsShadow(); - std::string expr = func; - expr += '('; - expr += GetSampler(meta->sampler); - expr += ", "; - + std::string expr = "texture" + function_suffix; + if 
(!meta->aoffi.empty()) { + expr += "Offset"; + } + expr += '(' + GetSampler(meta->sampler) + ", "; expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { @@ -751,36 +761,74 @@ private: } expr += ')'; - for (const auto& extra_pair : extras) { - const auto [type, operand] = extra_pair; - if (operand == nullptr) { - continue; + for (const auto& variant : extras) { + if (const auto argument = std::get_if<TextureArgument>(&variant)) { + expr += GenerateTextureArgument(*argument); + } else if (std::get_if<TextureAoffi>(&variant)) { + expr += GenerateTextureAoffi(meta->aoffi); + } else { + UNREACHABLE(); } - expr += ", "; + } - switch (type) { - case Type::Int: - if (const auto immediate = std::get_if<ImmediateNode>(operand)) { - // Inline the string as an immediate integer in GLSL (some extra arguments are - // required to be constant) - expr += std::to_string(static_cast<s32>(immediate->GetValue())); - } else { - expr += "ftoi(" + Visit(operand) + ')'; - } - break; - case Type::Float: - expr += Visit(operand); - break; - default: { - const auto type_int = static_cast<u32>(type); - UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int); - expr += '0'; - break; + return expr + ')'; + } + + std::string GenerateTextureArgument(TextureArgument argument) { + const auto [type, operand] = argument; + if (operand == nullptr) { + return {}; + } + + std::string expr = ", "; + switch (type) { + case Type::Int: + if (const auto immediate = std::get_if<ImmediateNode>(operand)) { + // Inline the string as an immediate integer in GLSL (some extra arguments are + // required to be constant) + expr += std::to_string(static_cast<s32>(immediate->GetValue())); + } else { + expr += "ftoi(" + Visit(operand) + ')'; } + break; + case Type::Float: + expr += Visit(operand); + break; + default: { + const auto type_int = static_cast<u32>(type); + UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int); + 
expr += '0'; + break; + } + } + return expr; + } + + std::string GenerateTextureAoffi(const std::vector<Node>& aoffi) { + if (aoffi.empty()) { + return {}; + } + constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"}; + std::string expr = ", "; + expr += coord_constructors.at(aoffi.size() - 1); + expr += '('; + + for (std::size_t index = 0; index < aoffi.size(); ++index) { + const auto operand{aoffi.at(index)}; + if (const auto immediate = std::get_if<ImmediateNode>(operand)) { + // Inline the string as an immediate integer in GLSL (AOFFI arguments are required + // to be constant by the standard). + expr += std::to_string(static_cast<s32>(immediate->GetValue())); + } else { + expr += "ftoi(" + Visit(operand) + ')'; + } + if (index + 1 < aoffi.size()) { + expr += ", "; } } + expr += ')'; - return expr + ')'; + return expr; } std::string Assign(Operation operation) { @@ -1159,7 +1207,8 @@ private: const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture(operation, "texture", {{Type::Float, meta->bias}}); + std::string expr = GenerateTexture( + operation, "", {TextureAoffi{}, TextureArgument{Type::Float, meta->bias}}); if (meta->sampler.IsShadow()) { expr = "vec4(" + expr + ')'; } @@ -1170,7 +1219,8 @@ private: const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture(operation, "textureLod", {{Type::Float, meta->lod}}); + std::string expr = GenerateTexture( + operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureAoffi{}}); if (meta->sampler.IsShadow()) { expr = "vec4(" + expr + ')'; } @@ -1182,7 +1232,8 @@ private: ASSERT(meta); const auto type = meta->sampler.IsShadow() ? 
Type::Float : Type::Int; - return GenerateTexture(operation, "textureGather", {{type, meta->component}}) + + return GenerateTexture(operation, "Gather", + {TextureArgument{type, meta->component}, TextureAoffi{}}) + GetSwizzle(meta->element); } @@ -1196,11 +1247,12 @@ private: switch (meta->element) { case 0: case 1: - return "textureSize(" + sampler + ", " + lod + ')' + GetSwizzle(meta->element); + return "itof(int(textureSize(" + sampler + ", " + lod + ')' + + GetSwizzle(meta->element) + "))"; case 2: return "0"; case 3: - return "textureQueryLevels(" + sampler + ')'; + return "itof(textureQueryLevels(" + sampler + "))"; } UNREACHABLE(); return "0"; @@ -1211,8 +1263,8 @@ private: ASSERT(meta); if (meta->element < 2) { - return "itof(int((" + GenerateTexture(operation, "textureQueryLod", {}) + - " * vec2(256))" + GetSwizzle(meta->element) + "))"; + return "itof(int((" + GenerateTexture(operation, "QueryLod", {}) + " * vec2(256))" + + GetSwizzle(meta->element) + "))"; } return "0"; } @@ -1565,6 +1617,8 @@ private: ShaderWriter code; }; +} // Anonymous namespace + std::string GetCommonDeclarations() { const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS); const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 72aca4938..4e04ab2f8 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -5,7 +5,6 @@ #pragma once #include <array> -#include <set> #include <string> #include <utility> #include <vector> diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 82fc4d44b..8a43eb157 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -4,7 +4,6 @@ #include <cstring> #include <fmt/format.h> -#include <lz4.h> 
#include "common/assert.h" #include "common/common_paths.h" @@ -12,6 +11,7 @@ #include "common/file_util.h" #include "common/logging/log.h" #include "common/scm_rev.h" +#include "common/zstd_compression.h" #include "core/core.h" #include "core/hle/kernel/process.h" @@ -49,39 +49,6 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { return hash; } -template <typename T> -std::vector<u8> CompressData(const T* source, std::size_t source_size) { - if (source_size > LZ4_MAX_INPUT_SIZE) { - // Source size exceeds LZ4 maximum input size - return {}; - } - const auto source_size_int = static_cast<int>(source_size); - const int max_compressed_size = LZ4_compressBound(source_size_int); - std::vector<u8> compressed(max_compressed_size); - const int compressed_size = LZ4_compress_default(reinterpret_cast<const char*>(source), - reinterpret_cast<char*>(compressed.data()), - source_size_int, max_compressed_size); - if (compressed_size <= 0) { - // Compression failed - return {}; - } - compressed.resize(compressed_size); - return compressed; -} - -std::vector<u8> DecompressData(const std::vector<u8>& compressed, std::size_t uncompressed_size) { - std::vector<u8> uncompressed(uncompressed_size); - const int size_check = LZ4_decompress_safe(reinterpret_cast<const char*>(compressed.data()), - reinterpret_cast<char*>(uncompressed.data()), - static_cast<int>(compressed.size()), - static_cast<int>(uncompressed.size())); - if (static_cast<int>(uncompressed_size) != size_check) { - // Decompression failed - return {}; - } - return uncompressed; -} - } // namespace ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, @@ -292,7 +259,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { return {}; } - dump.binary = DecompressData(compressed_binary, binary_length); + dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary); if (dump.binary.empty()) { return {}; } @@ -321,7 +288,7 @@ 
std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn return {}; } - const std::vector<u8> code = DecompressData(compressed_code, code_size); + const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code); if (code.empty()) { return {}; } @@ -507,7 +474,8 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str if (!IsUsable()) return; - const std::vector<u8> compressed_code{CompressData(code.data(), code.size())}; + const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault( + reinterpret_cast<const u8*>(code.data()), code.size())}; if (compressed_code.empty()) { LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}", unique_identifier); @@ -537,7 +505,9 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p std::vector<u8> binary(binary_length); glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); - const std::vector<u8> compressed_binary = CompressData(binary.data(), binary.size()); + const std::vector<u8> compressed_binary = + Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size()); + if (compressed_binary.empty()) { LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}", usage.unique_identifier); diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 7d96649af..8763d9c71 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -3,7 +3,6 @@ // Refer to the license.txt file included. 
#include <fmt/format.h> -#include "common/assert.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_gen.h" diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index fba8e681b..fad346b48 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -4,12 +4,9 @@ #pragma once -#include <array> -#include <string> #include <vector> #include "common/common_types.h" -#include "video_core/engines/shader_bytecode.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 6a30c28d2..eaf3e03a0 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -2,15 +2,15 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "core/core.h" #include "video_core/renderer_opengl/gl_shader_manager.h" namespace OpenGL::GLShader { -void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) { - const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D(); - const auto& regs = gpu.regs; - const auto& state = gpu.state; +using Tegra::Engines::Maxwell3D; + +void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shader_stage) { + const auto& regs = maxwell.regs; + const auto& state = maxwell.state; // TODO(bunnei): Support more than one viewport viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? 
-1.0f : 1.0f; @@ -18,7 +18,7 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh u32 func = static_cast<u32>(regs.alpha_test_func); // Normalize the gl variants of opCompare to be the same as the normal variants - u32 op_gl_variant_base = static_cast<u32>(Tegra::Engines::Maxwell3D::Regs::ComparisonOp::Never); + const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never); if (func >= op_gl_variant_base) { func = func - op_gl_variant_base + 1U; } @@ -31,8 +31,9 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh // Assign in which stage the position has to be flipped // (the last stage before the fragment shader). - if (gpu.regs.shader_config[static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry)].enable) { - flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry); + constexpr u32 geometry_index = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry); + if (maxwell.regs.shader_config[geometry_index].enable) { + flip_stage = geometry_index; } else { flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB); } diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 4970aafed..37dcfefdb 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -12,14 +12,13 @@ namespace OpenGL::GLShader { -using Tegra::Engines::Maxwell3D; - /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned -// NOTE: Always keep a vec4 at the end. The GL spec is not clear whether the alignment at -// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not. -// Not following that rule will cause problems on some AMD drivers. +/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at +/// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not. 
+/// Not following that rule will cause problems on some AMD drivers. struct MaxwellUniformData { - void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage); + void SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell, std::size_t shader_stage); + alignas(16) GLvec4 viewport_flip; struct alignas(16) { GLuint instance_id; @@ -63,7 +62,6 @@ public: UpdatePipeline(); state.draw.shader_program = 0; state.draw.program_pipeline = pipeline.handle; - state.geometry_shaders.enabled = (gs != 0); } private: diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 9419326a3..52d569a1b 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -10,16 +10,62 @@ namespace OpenGL { -OpenGLState OpenGLState::cur_state; +using Maxwell = Tegra::Engines::Maxwell3D::Regs; +OpenGLState OpenGLState::cur_state; bool OpenGLState::s_rgb_used; +namespace { + +template <typename T> +bool UpdateValue(T& current_value, const T new_value) { + const bool changed = current_value != new_value; + current_value = new_value; + return changed; +} + +template <typename T1, typename T2> +bool UpdateTie(T1 current_value, const T2 new_value) { + const bool changed = current_value != new_value; + current_value = new_value; + return changed; +} + +void Enable(GLenum cap, bool enable) { + if (enable) { + glEnable(cap); + } else { + glDisable(cap); + } +} + +void Enable(GLenum cap, GLuint index, bool enable) { + if (enable) { + glEnablei(cap, index); + } else { + glDisablei(cap, index); + } +} + +void Enable(GLenum cap, bool& current_value, bool new_value) { + if (UpdateValue(current_value, new_value)) + Enable(cap, new_value); +} + +void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) { + if (UpdateValue(current_value, new_value)) + Enable(cap, index, new_value); +} + +} // namespace + OpenGLState::OpenGLState() { // These all match default OpenGL values - 
geometry_shaders.enabled = false; framebuffer_srgb.enabled = false; + multisample_control.alpha_to_coverage = false; multisample_control.alpha_to_one = false; + cull.enabled = false; cull.mode = GL_BACK; cull.front_face = GL_CCW; @@ -30,14 +76,15 @@ OpenGLState::OpenGLState() { primitive_restart.enabled = false; primitive_restart.index = 0; + for (auto& item : color_mask) { item.red_enabled = GL_TRUE; item.green_enabled = GL_TRUE; item.blue_enabled = GL_TRUE; item.alpha_enabled = GL_TRUE; } - stencil.test_enabled = false; - auto reset_stencil = [](auto& config) { + + const auto ResetStencil = [](auto& config) { config.test_func = GL_ALWAYS; config.test_ref = 0; config.test_mask = 0xFFFFFFFF; @@ -46,8 +93,10 @@ OpenGLState::OpenGLState() { config.action_depth_pass = GL_KEEP; config.action_stencil_fail = GL_KEEP; }; - reset_stencil(stencil.front); - reset_stencil(stencil.back); + stencil.test_enabled = false; + ResetStencil(stencil.front); + ResetStencil(stencil.back); + for (auto& item : viewports) { item.x = 0; item.y = 0; @@ -61,6 +110,7 @@ OpenGLState::OpenGLState() { item.scissor.width = 0; item.scissor.height = 0; } + for (auto& item : blend) { item.enabled = true; item.rgb_equation = GL_FUNC_ADD; @@ -70,11 +120,14 @@ OpenGLState::OpenGLState() { item.src_a_func = GL_ONE; item.dst_a_func = GL_ZERO; } + independant_blend.enabled = false; + blend_color.red = 0.0f; blend_color.green = 0.0f; blend_color.blue = 0.0f; blend_color.alpha = 0.0f; + logic_op.enabled = false; logic_op.operation = GL_COPY; @@ -91,9 +144,12 @@ OpenGLState::OpenGLState() { clip_distance = {}; point.size = 1; + fragment_color_clamp.enabled = false; + depth_clamp.far_plane = false; depth_clamp.near_plane = false; + polygon_offset.fill_enable = false; polygon_offset.line_enable = false; polygon_offset.point_enable = false; @@ -103,260 +159,255 @@ OpenGLState::OpenGLState() { } void OpenGLState::ApplyDefaultState() { + glEnable(GL_BLEND); glDisable(GL_FRAMEBUFFER_SRGB); glDisable(GL_CULL_FACE); 
glDisable(GL_DEPTH_TEST); glDisable(GL_PRIMITIVE_RESTART); glDisable(GL_STENCIL_TEST); - glEnable(GL_BLEND); glDisable(GL_COLOR_LOGIC_OP); glDisable(GL_SCISSOR_TEST); } +void OpenGLState::ApplyFramebufferState() const { + if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) { + glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer); + } + if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) { + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer); + } +} + +void OpenGLState::ApplyVertexArrayState() const { + if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) { + glBindVertexArray(draw.vertex_array); + } +} + +void OpenGLState::ApplyShaderProgram() const { + if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) { + glUseProgram(draw.shader_program); + } +} + +void OpenGLState::ApplyProgramPipeline() const { + if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) { + glBindProgramPipeline(draw.program_pipeline); + } +} + +void OpenGLState::ApplyClipDistances() const { + for (std::size_t i = 0; i < clip_distance.size(); ++i) { + Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i], + clip_distance[i]); + } +} + +void OpenGLState::ApplyPointSize() const { + if (UpdateValue(cur_state.point.size, point.size)) { + glPointSize(point.size); + } +} + +void OpenGLState::ApplyFragmentColorClamp() const { + if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) { + glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, + fragment_color_clamp.enabled ? 
GL_TRUE : GL_FALSE); + } +} + +void OpenGLState::ApplyMultisample() const { + Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage, + multisample_control.alpha_to_coverage); + Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one, + multisample_control.alpha_to_one); +} + +void OpenGLState::ApplyDepthClamp() const { + if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane && + depth_clamp.near_plane == cur_state.depth_clamp.near_plane) { + return; + } + cur_state.depth_clamp = depth_clamp; + + UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane, + "Unimplemented Depth Clamp Separation!"); + + Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane); +} + void OpenGLState::ApplySRgb() const { - if (framebuffer_srgb.enabled != cur_state.framebuffer_srgb.enabled) { - if (framebuffer_srgb.enabled) { - // Track if sRGB is used - s_rgb_used = true; - glEnable(GL_FRAMEBUFFER_SRGB); - } else { - glDisable(GL_FRAMEBUFFER_SRGB); - } + if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled) + return; + cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled; + if (framebuffer_srgb.enabled) { + // Track if sRGB is used + s_rgb_used = true; + glEnable(GL_FRAMEBUFFER_SRGB); + } else { + glDisable(GL_FRAMEBUFFER_SRGB); } } void OpenGLState::ApplyCulling() const { - if (cull.enabled != cur_state.cull.enabled) { - if (cull.enabled) { - glEnable(GL_CULL_FACE); - } else { - glDisable(GL_CULL_FACE); - } - } + Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled); - if (cull.mode != cur_state.cull.mode) { + if (UpdateValue(cur_state.cull.mode, cull.mode)) { glCullFace(cull.mode); } - if (cull.front_face != cur_state.cull.front_face) { + if (UpdateValue(cur_state.cull.front_face, cull.front_face)) { glFrontFace(cull.front_face); } } void OpenGLState::ApplyColorMask() const { - if (independant_blend.enabled) { - for (size_t i = 0; i < 
Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - const auto& updated = color_mask[i]; - const auto& current = cur_state.color_mask[i]; - if (updated.red_enabled != current.red_enabled || - updated.green_enabled != current.green_enabled || - updated.blue_enabled != current.blue_enabled || - updated.alpha_enabled != current.alpha_enabled) { - glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled, - updated.blue_enabled, updated.alpha_enabled); - } - } - } else { - const auto& updated = color_mask[0]; - const auto& current = cur_state.color_mask[0]; + for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { + const auto& updated = color_mask[i]; + auto& current = cur_state.color_mask[i]; if (updated.red_enabled != current.red_enabled || updated.green_enabled != current.green_enabled || updated.blue_enabled != current.blue_enabled || updated.alpha_enabled != current.alpha_enabled) { - glColorMask(updated.red_enabled, updated.green_enabled, updated.blue_enabled, - updated.alpha_enabled); + current = updated; + glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled, + updated.blue_enabled, updated.alpha_enabled); } } } void OpenGLState::ApplyDepth() const { - if (depth.test_enabled != cur_state.depth.test_enabled) { - if (depth.test_enabled) { - glEnable(GL_DEPTH_TEST); - } else { - glDisable(GL_DEPTH_TEST); - } - } + Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled); - if (depth.test_func != cur_state.depth.test_func) { + if (cur_state.depth.test_func != depth.test_func) { + cur_state.depth.test_func = depth.test_func; glDepthFunc(depth.test_func); } - if (depth.write_mask != cur_state.depth.write_mask) { + if (cur_state.depth.write_mask != depth.write_mask) { + cur_state.depth.write_mask = depth.write_mask; glDepthMask(depth.write_mask); } } void OpenGLState::ApplyPrimitiveRestart() const { - if (primitive_restart.enabled != cur_state.primitive_restart.enabled) { - if 
(primitive_restart.enabled) { - glEnable(GL_PRIMITIVE_RESTART); - } else { - glDisable(GL_PRIMITIVE_RESTART); - } - } + Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled); - if (primitive_restart.index != cur_state.primitive_restart.index) { + if (cur_state.primitive_restart.index != primitive_restart.index) { + cur_state.primitive_restart.index = primitive_restart.index; glPrimitiveRestartIndex(primitive_restart.index); } } void OpenGLState::ApplyStencilTest() const { - if (stencil.test_enabled != cur_state.stencil.test_enabled) { - if (stencil.test_enabled) { - glEnable(GL_STENCIL_TEST); - } else { - glDisable(GL_STENCIL_TEST); - } - } - - const auto ConfigStencil = [](GLenum face, const auto& config, const auto& prev_config) { - if (config.test_func != prev_config.test_func || config.test_ref != prev_config.test_ref || - config.test_mask != prev_config.test_mask) { + Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled); + + const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) { + if (current.test_func != config.test_func || current.test_ref != config.test_ref || + current.test_mask != config.test_mask) { + current.test_func = config.test_func; + current.test_ref = config.test_ref; + current.test_mask = config.test_mask; glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask); } - if (config.action_depth_fail != prev_config.action_depth_fail || - config.action_depth_pass != prev_config.action_depth_pass || - config.action_stencil_fail != prev_config.action_stencil_fail) { + if (current.action_depth_fail != config.action_depth_fail || + current.action_depth_pass != config.action_depth_pass || + current.action_stencil_fail != config.action_stencil_fail) { + current.action_depth_fail = config.action_depth_fail; + current.action_depth_pass = config.action_depth_pass; + current.action_stencil_fail = config.action_stencil_fail; glStencilOpSeparate(face, 
config.action_stencil_fail, config.action_depth_fail, config.action_depth_pass); } - if (config.write_mask != prev_config.write_mask) { + if (current.write_mask != config.write_mask) { + current.write_mask = config.write_mask; glStencilMaskSeparate(face, config.write_mask); } }; ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front); ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back); } -// Viewport does not affects glClearBuffer so emulate viewport using scissor test -void OpenGLState::EmulateViewportWithScissor() { - auto& current = viewports[0]; - if (current.scissor.enabled) { - const GLint left = std::max(current.x, current.scissor.x); - const GLint right = - std::max(current.x + current.width, current.scissor.x + current.scissor.width); - const GLint bottom = std::max(current.y, current.scissor.y); - const GLint top = - std::max(current.y + current.height, current.scissor.y + current.scissor.height); - current.scissor.x = std::max(left, 0); - current.scissor.y = std::max(bottom, 0); - current.scissor.width = std::max(right - left, 0); - current.scissor.height = std::max(top - bottom, 0); - } else { - current.scissor.enabled = true; - current.scissor.x = current.x; - current.scissor.y = current.y; - current.scissor.width = current.width; - current.scissor.height = current.height; - } -} void OpenGLState::ApplyViewport() const { - if (geometry_shaders.enabled) { - for (GLuint i = 0; i < static_cast<GLuint>(Tegra::Engines::Maxwell3D::Regs::NumViewports); - i++) { - const auto& current = cur_state.viewports[i]; - const auto& updated = viewports[i]; - if (updated.x != current.x || updated.y != current.y || - updated.width != current.width || updated.height != current.height) { - glViewportIndexedf( - i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y), - static_cast<GLfloat>(updated.width), static_cast<GLfloat>(updated.height)); - } - if (updated.depth_range_near != current.depth_range_near || - updated.depth_range_far != 
current.depth_range_far) { - glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far); - } - - if (updated.scissor.enabled != current.scissor.enabled) { - if (updated.scissor.enabled) { - glEnablei(GL_SCISSOR_TEST, i); - } else { - glDisablei(GL_SCISSOR_TEST, i); - } - } - - if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y || - updated.scissor.width != current.scissor.width || - updated.scissor.height != current.scissor.height) { - glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width, - updated.scissor.height); - } - } - } else { - const auto& current = cur_state.viewports[0]; - const auto& updated = viewports[0]; - if (updated.x != current.x || updated.y != current.y || updated.width != current.width || - updated.height != current.height) { - glViewport(updated.x, updated.y, updated.width, updated.height); - } - - if (updated.depth_range_near != current.depth_range_near || - updated.depth_range_far != current.depth_range_far) { - glDepthRange(updated.depth_range_near, updated.depth_range_far); + for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) { + const auto& updated = viewports[i]; + auto& current = cur_state.viewports[i]; + + if (current.x != updated.x || current.y != updated.y || current.width != updated.width || + current.height != updated.height) { + current.x = updated.x; + current.y = updated.y; + current.width = updated.width; + current.height = updated.height; + glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y), + static_cast<GLfloat>(updated.width), + static_cast<GLfloat>(updated.height)); } - - if (updated.scissor.enabled != current.scissor.enabled) { - if (updated.scissor.enabled) { - glEnable(GL_SCISSOR_TEST); - } else { - glDisable(GL_SCISSOR_TEST); - } + if (current.depth_range_near != updated.depth_range_near || + current.depth_range_far != updated.depth_range_far) { + current.depth_range_near = 
updated.depth_range_near; + current.depth_range_far = updated.depth_range_far; + glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far); } - if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y || - updated.scissor.width != current.scissor.width || - updated.scissor.height != current.scissor.height) { - glScissor(updated.scissor.x, updated.scissor.y, updated.scissor.width, - updated.scissor.height); + Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled); + + if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y || + current.scissor.width != updated.scissor.width || + current.scissor.height != updated.scissor.height) { + current.scissor.x = updated.scissor.x; + current.scissor.y = updated.scissor.y; + current.scissor.width = updated.scissor.width; + current.scissor.height = updated.scissor.height; + glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width, + updated.scissor.height); } } } void OpenGLState::ApplyGlobalBlending() const { - const Blend& current = cur_state.blend[0]; const Blend& updated = blend[0]; - if (updated.enabled != current.enabled) { - if (updated.enabled) { - glEnable(GL_BLEND); - } else { - glDisable(GL_BLEND); - } - } - if (!updated.enabled) { - return; - } - if (updated.src_rgb_func != current.src_rgb_func || - updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func || - updated.dst_a_func != current.dst_a_func) { + Blend& current = cur_state.blend[0]; + + Enable(GL_BLEND, current.enabled, updated.enabled); + + if (current.src_rgb_func != updated.src_rgb_func || + current.dst_rgb_func != updated.dst_rgb_func || current.src_a_func != updated.src_a_func || + current.dst_a_func != updated.dst_a_func) { + current.src_rgb_func = updated.src_rgb_func; + current.dst_rgb_func = updated.dst_rgb_func; + current.src_a_func = updated.src_a_func; + current.dst_a_func = updated.dst_a_func; 
glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func); } - if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) { + if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) { + current.rgb_equation = updated.rgb_equation; + current.a_equation = updated.a_equation; glBlendEquationSeparate(updated.rgb_equation, updated.a_equation); } } void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const { const Blend& updated = blend[target]; - const Blend& current = cur_state.blend[target]; - if (updated.enabled != current.enabled || force) { - if (updated.enabled) { - glEnablei(GL_BLEND, static_cast<GLuint>(target)); - } else { - glDisablei(GL_BLEND, static_cast<GLuint>(target)); - } + Blend& current = cur_state.blend[target]; + + if (current.enabled != updated.enabled || force) { + current.enabled = updated.enabled; + Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled); } - if (updated.src_rgb_func != current.src_rgb_func || - updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func || - updated.dst_a_func != current.dst_a_func) { + if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func, + current.dst_a_func), + std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, + updated.dst_a_func))) { glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func); } - if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) { + if (UpdateTie(std::tie(current.rgb_equation, current.a_equation), + std::tie(updated.rgb_equation, updated.a_equation))) { glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation, updated.a_equation); } @@ -364,77 +415,48 @@ void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const { 
void OpenGLState::ApplyBlending() const { if (independant_blend.enabled) { - for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - ApplyTargetBlending(i, - independant_blend.enabled != cur_state.independant_blend.enabled); + const bool force = independant_blend.enabled != cur_state.independant_blend.enabled; + for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) { + ApplyTargetBlending(target, force); } } else { ApplyGlobalBlending(); } - if (blend_color.red != cur_state.blend_color.red || - blend_color.green != cur_state.blend_color.green || - blend_color.blue != cur_state.blend_color.blue || - blend_color.alpha != cur_state.blend_color.alpha) { + cur_state.independant_blend.enabled = independant_blend.enabled; + + if (UpdateTie( + std::tie(cur_state.blend_color.red, cur_state.blend_color.green, + cur_state.blend_color.blue, cur_state.blend_color.alpha), + std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) { glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha); } } void OpenGLState::ApplyLogicOp() const { - if (logic_op.enabled != cur_state.logic_op.enabled) { - if (logic_op.enabled) { - glEnable(GL_COLOR_LOGIC_OP); - } else { - glDisable(GL_COLOR_LOGIC_OP); - } - } + Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled); - if (logic_op.operation != cur_state.logic_op.operation) { + if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) { glLogicOp(logic_op.operation); } } void OpenGLState::ApplyPolygonOffset() const { - const bool fill_enable_changed = - polygon_offset.fill_enable != cur_state.polygon_offset.fill_enable; - const bool line_enable_changed = - polygon_offset.line_enable != cur_state.polygon_offset.line_enable; - const bool point_enable_changed = - polygon_offset.point_enable != cur_state.polygon_offset.point_enable; - const bool factor_changed = polygon_offset.factor != cur_state.polygon_offset.factor; - 
const bool units_changed = polygon_offset.units != cur_state.polygon_offset.units; - const bool clamp_changed = polygon_offset.clamp != cur_state.polygon_offset.clamp; - - if (fill_enable_changed) { - if (polygon_offset.fill_enable) { - glEnable(GL_POLYGON_OFFSET_FILL); - } else { - glDisable(GL_POLYGON_OFFSET_FILL); - } - } - - if (line_enable_changed) { - if (polygon_offset.line_enable) { - glEnable(GL_POLYGON_OFFSET_LINE); - } else { - glDisable(GL_POLYGON_OFFSET_LINE); - } - } - - if (point_enable_changed) { - if (polygon_offset.point_enable) { - glEnable(GL_POLYGON_OFFSET_POINT); - } else { - glDisable(GL_POLYGON_OFFSET_POINT); - } - } - - if (factor_changed || units_changed || clamp_changed) { + Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable, + polygon_offset.fill_enable); + Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable, + polygon_offset.line_enable); + Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable, + polygon_offset.point_enable); + + if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units, + cur_state.polygon_offset.clamp), + std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) { if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) { glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp); } else { - glPolygonOffset(polygon_offset.factor, polygon_offset.units); UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0, "Unimplemented Depth polygon offset clamp."); + glPolygonOffset(polygon_offset.factor, polygon_offset.units); } } } @@ -443,22 +465,21 @@ void OpenGLState::ApplyTextures() const { bool has_delta{}; std::size_t first{}; std::size_t last{}; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures; + std::array<GLuint, Maxwell::NumTextureSamplers> textures; for (std::size_t i = 0; i < std::size(texture_units); ++i) { const auto& texture_unit = texture_units[i]; - 
const auto& cur_state_texture_unit = cur_state.texture_units[i]; + auto& cur_state_texture_unit = cur_state.texture_units[i]; textures[i] = texture_unit.texture; - - if (textures[i] != cur_state_texture_unit.texture) { - if (!has_delta) { - first = i; - has_delta = true; - } - last = i; + if (cur_state_texture_unit.texture == textures[i]) + continue; + cur_state_texture_unit.texture = textures[i]; + if (!has_delta) { + first = i; + has_delta = true; } + last = i; } - if (has_delta) { glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), textures.data() + first); @@ -469,16 +490,18 @@ void OpenGLState::ApplySamplers() const { bool has_delta{}; std::size_t first{}; std::size_t last{}; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers; + std::array<GLuint, Maxwell::NumTextureSamplers> samplers; + for (std::size_t i = 0; i < std::size(samplers); ++i) { samplers[i] = texture_units[i].sampler; + if (cur_state.texture_units[i].sampler == texture_units[i].sampler) + continue; // unchanged; slot already filled for the [first, last] bind range + cur_state.texture_units[i].sampler = texture_units[i].sampler; - if (samplers[i] != cur_state.texture_units[i].sampler) { - if (!has_delta) { - first = i; - has_delta = true; - } - last = i; + if (!has_delta) { + first = i; + has_delta = true; } + last = i; } if (has_delta) { glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1), @@ -486,81 +509,15 @@ void OpenGLState::ApplySamplers() const { } } -void OpenGLState::ApplyFramebufferState() const { - if (draw.read_framebuffer != cur_state.draw.read_framebuffer) { - glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer); - } - if (draw.draw_framebuffer != cur_state.draw.draw_framebuffer) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer); - } -} - -void OpenGLState::ApplyVertexArrayState() const { - if (draw.vertex_array != cur_state.draw.vertex_array) { - glBindVertexArray(draw.vertex_array); - } -} - -void OpenGLState::ApplyDepthClamp()
const { - if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane && - depth_clamp.near_plane == cur_state.depth_clamp.near_plane) { - return; - } - UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane, - "Unimplemented Depth Clamp Separation!"); - - if (depth_clamp.far_plane || depth_clamp.near_plane) { - glEnable(GL_DEPTH_CLAMP); - } else { - glDisable(GL_DEPTH_CLAMP); - } -} - void OpenGLState::Apply() const { ApplyFramebufferState(); ApplyVertexArrayState(); - - // Shader program - if (draw.shader_program != cur_state.draw.shader_program) { - glUseProgram(draw.shader_program); - } - - // Program pipeline - if (draw.program_pipeline != cur_state.draw.program_pipeline) { - glBindProgramPipeline(draw.program_pipeline); - } - // Clip distance - for (std::size_t i = 0; i < clip_distance.size(); ++i) { - if (clip_distance[i] != cur_state.clip_distance[i]) { - if (clip_distance[i]) { - glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i)); - } else { - glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i)); - } - } - } - // Point - if (point.size != cur_state.point.size) { - glPointSize(point.size); - } - if (fragment_color_clamp.enabled != cur_state.fragment_color_clamp.enabled) { - glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, - fragment_color_clamp.enabled ? 
GL_TRUE : GL_FALSE); - } - if (multisample_control.alpha_to_coverage != cur_state.multisample_control.alpha_to_coverage) { - if (multisample_control.alpha_to_coverage) { - glEnable(GL_SAMPLE_ALPHA_TO_COVERAGE); - } else { - glDisable(GL_SAMPLE_ALPHA_TO_COVERAGE); - } - } - if (multisample_control.alpha_to_one != cur_state.multisample_control.alpha_to_one) { - if (multisample_control.alpha_to_one) { - glEnable(GL_SAMPLE_ALPHA_TO_ONE); - } else { - glDisable(GL_SAMPLE_ALPHA_TO_ONE); - } - } + ApplyShaderProgram(); + ApplyProgramPipeline(); + ApplyClipDistances(); + ApplyPointSize(); + ApplyFragmentColorClamp(); + ApplyMultisample(); ApplyDepthClamp(); ApplyColorMask(); ApplyViewport(); @@ -574,7 +531,29 @@ void OpenGLState::Apply() const { ApplyTextures(); ApplySamplers(); ApplyPolygonOffset(); - cur_state = *this; +} + +void OpenGLState::EmulateViewportWithScissor() { + auto& current = viewports[0]; + if (current.scissor.enabled) { + // Intersect the scissor box with the viewport: max of the near edges, min of the far edges + const GLint left = std::max(current.x, current.scissor.x); + const GLint right = + std::min(current.x + current.width, current.scissor.x + current.scissor.width); + const GLint bottom = std::max(current.y, current.scissor.y); + const GLint top = + std::min(current.y + current.height, current.scissor.y + current.scissor.height); + current.scissor.x = std::max(left, 0); + current.scissor.y = std::max(bottom, 0); + current.scissor.width = std::max(right - left, 0); + current.scissor.height = std::max(top - bottom, 0); + } else { + current.scissor.enabled = true; + current.scissor.x = current.x; + current.scissor.y = current.y; + current.scissor.width = current.width; + current.scissor.height = current.height; + } } OpenGLState& OpenGLState::UnbindTexture(GLuint handle) { diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 9e1eda5b1..41418a7b8 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -54,10 +54,6 @@ public: } depth_clamp; //
GL_DEPTH_CLAMP struct { - bool enabled; // viewports arrays are only supported when geometry shaders are enabled. - } geometry_shaders; - - struct { bool enabled; // GL_CULL_FACE GLenum mode; // GL_CULL_FACE_MODE GLenum front_face; // GL_FRONT_FACE @@ -184,34 +180,26 @@ public: static OpenGLState GetCurState() { return cur_state; } + static bool GetsRGBUsed() { return s_rgb_used; } + static void ClearsRGBUsed() { s_rgb_used = false; } + /// Apply this state as the current OpenGL state void Apply() const; - /// Apply only the state affecting the framebuffer + void ApplyFramebufferState() const; - /// Apply only the state affecting the vertex array void ApplyVertexArrayState() const; - /// Set the initial OpenGL state - static void ApplyDefaultState(); - /// Resets any references to the given resource - OpenGLState& UnbindTexture(GLuint handle); - OpenGLState& ResetSampler(GLuint handle); - OpenGLState& ResetProgram(GLuint handle); - OpenGLState& ResetPipeline(GLuint handle); - OpenGLState& ResetVertexArray(GLuint handle); - OpenGLState& ResetFramebuffer(GLuint handle); - void EmulateViewportWithScissor(); - -private: - static OpenGLState cur_state; - // Workaround for sRGB problems caused by - // QT not supporting srgb output - static bool s_rgb_used; + void ApplyShaderProgram() const; + void ApplyProgramPipeline() const; + void ApplyClipDistances() const; + void ApplyPointSize() const; + void ApplyFragmentColorClamp() const; + void ApplyMultisample() const; void ApplySRgb() const; void ApplyCulling() const; void ApplyColorMask() const; @@ -227,6 +215,26 @@ private: void ApplySamplers() const; void ApplyDepthClamp() const; void ApplyPolygonOffset() const; + + /// Set the initial OpenGL state + static void ApplyDefaultState(); + + /// Resets any references to the given resource + OpenGLState& UnbindTexture(GLuint handle); + OpenGLState& ResetSampler(GLuint handle); + OpenGLState& ResetProgram(GLuint handle); + OpenGLState& ResetPipeline(GLuint handle); + OpenGLState& 
ResetVertexArray(GLuint handle); + OpenGLState& ResetFramebuffer(GLuint handle); + + /// Viewport does not affect glClearBuffer so emulate viewport using scissor test + void EmulateViewportWithScissor(); + +private: + static OpenGLState cur_state; + + // Workaround for sRGB problems caused by QT not supporting srgb output + static bool s_rgb_used; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 5e3d862c6..d69cba9c3 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,7 +5,6 @@ #include <algorithm> #include <cstddef> #include <cstdlib> -#include <cstring> #include <memory> #include <glad/glad.h> #include "common/assert.h" @@ -266,7 +265,7 @@ void RendererOpenGL::CreateRasterizer() { } // Initialize sRGB Usage OpenGLState::ClearsRGBUsed(); - rasterizer = std::make_unique<RasterizerOpenGL>(render_window, system, screen_info); + rasterizer = std::make_unique<RasterizerOpenGL>(system, screen_info); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index d84634cb3..84a987371 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -5,11 +5,39 @@ #include <string> #include <fmt/format.h> #include <glad/glad.h> +#include "common/assert.h" #include "common/common_types.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { +BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} + +BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; + +void BindBuffersRangePushBuffer::Setup(GLuint first_) { + first = first_; + buffers.clear(); + offsets.clear(); + sizes.clear(); +} + +void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) { +
buffers.push_back(buffer); + offsets.push_back(offset); + sizes.push_back(size); +} + +void BindBuffersRangePushBuffer::Bind() const { + const std::size_t count{buffers.size()}; + DEBUG_ASSERT(count == offsets.size() && count == sizes.size()); + if (count == 0) { + return; + } + glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(), + sizes.data()); +} + void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info) { if (!GLAD_GL_KHR_debug) { return; // We don't need to throw an error as this is just for debugging diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 1fcb6fc11..aef45c9dc 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -5,11 +5,31 @@ #pragma once #include <string> +#include <vector> #include <glad/glad.h> #include "common/common_types.h" namespace OpenGL { +class BindBuffersRangePushBuffer { +public: + BindBuffersRangePushBuffer(GLenum target); + ~BindBuffersRangePushBuffer(); + + void Setup(GLuint first_); + + void Push(GLuint buffer, GLintptr offset, GLsizeiptr size); + + void Bind() const; + +private: + GLenum target; + GLuint first; + std::vector<GLuint> buffers; + std::vector<GLintptr> offsets; + std::vector<GLsizeiptr> sizes; +}; + void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info = ""); } // namespace OpenGL
\ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 95eab3fec..02a9f5ecb 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -10,6 +10,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "core/memory.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -19,8 +20,8 @@ namespace Vulkan { CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, std::size_t alignment, u8* host_ptr) - : cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{ - host_ptr} {} + : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset}, + alignment{alignment} {} VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, @@ -39,8 +40,7 @@ VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager, VKBufferCache::~VKBufferCache() = default; -u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment, - bool cache) { +u64 VKBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, u64 alignment, bool cache) { const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)}; ASSERT_MSG(cpu_addr, "Invalid GPU address"); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 8b415744b..08b786aad 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -68,8 +68,7 @@ public: /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been /// allocated. 
- u64 UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment = 4, - bool cache = true); + u64 UploadMemory(GPUVAddr gpu_addr, std::size_t size, u64 alignment = 4, bool cache = true); /// Uploads from a host memory. Returns host's buffer offset where it's been allocated. u64 UploadHostMemory(const u8* raw_pointer, std::size_t size, u64 alignment = 4); diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp index a1e117443..13c46e5b8 100644 --- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_resource_manager.cpp @@ -21,7 +21,7 @@ public: CommandBufferPool(const VKDevice& device) : VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {} - void Allocate(std::size_t begin, std::size_t end) { + void Allocate(std::size_t begin, std::size_t end) override { const auto dev = device.GetLogical(); const auto& dld = device.GetDispatchLoader(); const u32 graphics_family = device.GetGraphicsFamily(); diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h index 5bfe4cead..08ee86fa6 100644 --- a/src/video_core/renderer_vulkan/vk_resource_manager.h +++ b/src/video_core/renderer_vulkan/vk_resource_manager.h @@ -97,7 +97,7 @@ private: class VKFenceWatch final : public VKResource { public: explicit VKFenceWatch(); - ~VKFenceWatch(); + ~VKFenceWatch() override; /// Waits for the fence to be released. void Wait(); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp new file mode 100644 index 000000000..08279e562 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -0,0 +1,210 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <limits> +#include <vector> + +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/core.h" +#include "core/frontend/framebuffer_layout.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" + +namespace Vulkan { + +namespace { +vk::SurfaceFormatKHR ChooseSwapSurfaceFormat(const std::vector<vk::SurfaceFormatKHR>& formats) { + if (formats.size() == 1 && formats[0].format == vk::Format::eUndefined) { + return {vk::Format::eB8G8R8A8Unorm, vk::ColorSpaceKHR::eSrgbNonlinear}; + } + const auto& found = std::find_if(formats.begin(), formats.end(), [](const auto& format) { + return format.format == vk::Format::eB8G8R8A8Unorm && + format.colorSpace == vk::ColorSpaceKHR::eSrgbNonlinear; + }); + return found != formats.end() ? *found : formats[0]; +} + +vk::PresentModeKHR ChooseSwapPresentMode(const std::vector<vk::PresentModeKHR>& modes) { + // Mailbox doesn't lock the application like fifo (vsync), prefer it + const auto& found = std::find_if(modes.begin(), modes.end(), [](const auto& mode) { + return mode == vk::PresentModeKHR::eMailbox; + }); + return found != modes.end() ? 
*found : vk::PresentModeKHR::eFifo; +} + +vk::Extent2D ChooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width, + u32 height) { + constexpr auto undefined_size{std::numeric_limits<u32>::max()}; + if (capabilities.currentExtent.width != undefined_size) { + return capabilities.currentExtent; + } + vk::Extent2D extent = {width, height}; + extent.width = std::max(capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, extent.width)); + extent.height = std::max(capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, extent.height)); + return extent; +} +} // namespace + +VKSwapchain::VKSwapchain(vk::SurfaceKHR surface, const VKDevice& device) + : surface{surface}, device{device} {} + +VKSwapchain::~VKSwapchain() = default; + +void VKSwapchain::Create(u32 width, u32 height) { + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + const auto physical_device = device.GetPhysical(); + + const vk::SurfaceCapabilitiesKHR capabilities{ + physical_device.getSurfaceCapabilitiesKHR(surface, dld)}; + if (capabilities.maxImageExtent.width == 0 || capabilities.maxImageExtent.height == 0) { + return; + } + + dev.waitIdle(dld); + Destroy(); + + CreateSwapchain(capabilities, width, height); + CreateSemaphores(); + CreateImageViews(); + + fences.resize(image_count, nullptr); +} + +void VKSwapchain::AcquireNextImage() { + const auto dev{device.GetLogical()}; + const auto& dld{device.GetDispatchLoader()}; + dev.acquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(), + *present_semaphores[frame_index], {}, &image_index, dld); + + if (auto& fence = fences[image_index]; fence) { + fence->Wait(); + fence->Release(); + fence = nullptr; + } +} + +bool VKSwapchain::Present(vk::Semaphore render_semaphore, VKFence& fence) { + const vk::Semaphore present_semaphore{*present_semaphores[frame_index]}; + const std::array<vk::Semaphore, 2> semaphores{present_semaphore, render_semaphore}; + 
const u32 wait_semaphore_count{render_semaphore ? 2U : 1U}; + const auto& dld{device.GetDispatchLoader()}; + const auto present_queue{device.GetPresentQueue()}; + bool recreated = false; + + const vk::PresentInfoKHR present_info(wait_semaphore_count, semaphores.data(), 1, + &swapchain.get(), &image_index, {}); + switch (const auto result = present_queue.presentKHR(&present_info, dld); result) { + case vk::Result::eSuccess: + break; + case vk::Result::eErrorOutOfDateKHR: + if (current_width > 0 && current_height > 0) { + Create(current_width, current_height); + recreated = true; + } + break; + default: + LOG_CRITICAL(Render_Vulkan, "Vulkan failed to present swapchain due to {}!", + vk::to_string(result)); + UNREACHABLE(); + } + + ASSERT(fences[image_index] == nullptr); + fences[image_index] = &fence; + frame_index = (frame_index + 1) % image_count; + return recreated; +} + +bool VKSwapchain::HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const { + // TODO(Rodrigo): Handle framebuffer pixel format changes + return framebuffer.width != current_width || framebuffer.height != current_height; +} + +void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width, + u32 height) { + const auto dev{device.GetLogical()}; + const auto& dld{device.GetDispatchLoader()}; + const auto physical_device{device.GetPhysical()}; + + const std::vector<vk::SurfaceFormatKHR> formats{ + physical_device.getSurfaceFormatsKHR(surface, dld)}; + + const std::vector<vk::PresentModeKHR> present_modes{ + physical_device.getSurfacePresentModesKHR(surface, dld)}; + + const vk::SurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)}; + const vk::PresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)}; + extent = ChooseSwapExtent(capabilities, width, height); + + current_width = extent.width; + current_height = extent.height; + + u32 requested_image_count{capabilities.minImageCount + 1}; + if (capabilities.maxImageCount > 0 && 
requested_image_count > capabilities.maxImageCount) { + requested_image_count = capabilities.maxImageCount; + } + + vk::SwapchainCreateInfoKHR swapchain_ci( + {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, + extent, 1, vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, + capabilities.currentTransform, vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, + {}); + + const u32 graphics_family{device.GetGraphicsFamily()}; + const u32 present_family{device.GetPresentFamily()}; + const std::array<u32, 2> queue_indices{graphics_family, present_family}; + if (graphics_family != present_family) { + swapchain_ci.imageSharingMode = vk::SharingMode::eConcurrent; + swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size()); + swapchain_ci.pQueueFamilyIndices = queue_indices.data(); + } else { + swapchain_ci.imageSharingMode = vk::SharingMode::eExclusive; + } + + swapchain = dev.createSwapchainKHRUnique(swapchain_ci, nullptr, dld); + + images = dev.getSwapchainImagesKHR(*swapchain, dld); + image_count = static_cast<u32>(images.size()); + image_format = surface_format.format; +} + +void VKSwapchain::CreateSemaphores() { + const auto dev{device.GetLogical()}; + const auto& dld{device.GetDispatchLoader()}; + + present_semaphores.resize(image_count); + for (std::size_t i = 0; i < image_count; i++) { + present_semaphores[i] = dev.createSemaphoreUnique({}, nullptr, dld); + } +} + +void VKSwapchain::CreateImageViews() { + const auto dev{device.GetLogical()}; + const auto& dld{device.GetDispatchLoader()}; + + image_views.resize(image_count); + for (std::size_t i = 0; i < image_count; i++) { + const vk::ImageViewCreateInfo image_view_ci({}, images[i], vk::ImageViewType::e2D, + image_format, {}, + {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}); + image_views[i] = dev.createImageViewUnique(image_view_ci, nullptr, dld); + } +} + +void VKSwapchain::Destroy() { + frame_index = 0; + present_semaphores.clear(); + 
framebuffers.clear(); + image_views.clear(); + swapchain.reset(); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h new file mode 100644 index 000000000..2ad84f185 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -0,0 +1,92 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" +#include "video_core/renderer_vulkan/declarations.h" + +namespace Layout { +struct FramebufferLayout; +} + +namespace Vulkan { + +class VKDevice; +class VKFence; + +class VKSwapchain { +public: + explicit VKSwapchain(vk::SurfaceKHR surface, const VKDevice& device); + ~VKSwapchain(); + + /// Creates (or recreates) the swapchain with a given size. + void Create(u32 width, u32 height); + + /// Acquires the next image in the swapchain, waits as needed. + void AcquireNextImage(); + + /// Presents the rendered image to the swapchain. Returns true when the swapchain had to be + /// recreated. Takes responsibility for the ownership of fence. + bool Present(vk::Semaphore render_semaphore, VKFence& fence); + + /// Returns true when the framebuffer layout has changed.
+ bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const; + + const vk::Extent2D& GetSize() const { + return extent; + } + + u32 GetImageCount() const { + return image_count; + } + + u32 GetImageIndex() const { + return image_index; + } + + vk::Image GetImageIndex(u32 index) const { + return images[index]; + } + + vk::ImageView GetImageViewIndex(u32 index) const { + return *image_views[index]; + } + + vk::Format GetImageFormat() const { + return image_format; + } + +private: + void CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities, u32 width, u32 height); + void CreateSemaphores(); + void CreateImageViews(); + + void Destroy(); + + const vk::SurfaceKHR surface; + const VKDevice& device; + + UniqueSwapchainKHR swapchain; + + u32 image_count{}; + std::vector<vk::Image> images; + std::vector<UniqueImageView> image_views; + std::vector<UniqueFramebuffer> framebuffers; + std::vector<VKFence*> fences; + std::vector<UniqueSemaphore> present_semaphores; + + u32 image_index{}; + u32 frame_index{}; + + vk::Format image_format{}; + vk::Extent2D extent{}; + + u32 current_width{}; + u32 current_height{}; +}; + +} // namespace Vulkan diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index a99ae19bf..a775b402b 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -7,7 +7,9 @@ #include <fmt/format.h> #include "common/assert.h" +#include "common/bit_field.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/shader_ir.h" @@ -41,19 +43,18 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { switch (opcode->get().GetId()) { case OpCode::Id::TEX: { - UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI), - "AOFFI is not implemented"); - if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) { LOG_WARNING(HW_GPU, "TEX.NODEP implementation is 
incomplete"); } const TextureType texture_type{instr.tex.texture_type}; const bool is_array = instr.tex.array != 0; + const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI); const bool depth_compare = instr.tex.UsesMiscMode(TextureMiscMode::DC); const auto process_mode = instr.tex.GetTextureProcessMode(); WriteTexInstructionFloat( - bb, instr, GetTexCode(instr, texture_type, process_mode, depth_compare, is_array)); + bb, instr, + GetTexCode(instr, texture_type, process_mode, depth_compare, is_array, is_aoffi)); break; } case OpCode::Id::TEXS: { @@ -78,8 +79,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { } case OpCode::Id::TLD4: { ASSERT(instr.tld4.array == 0); - UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI), - "AOFFI is not implemented"); UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::NDV), "NDV is not implemented"); UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP), @@ -92,8 +91,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { const auto texture_type = instr.tld4.texture_type.Value(); const bool depth_compare = instr.tld4.UsesMiscMode(TextureMiscMode::DC); const bool is_array = instr.tld4.array != 0; - WriteTexInstructionFloat(bb, instr, - GetTld4Code(instr, texture_type, depth_compare, is_array)); + const bool is_aoffi = instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI); + WriteTexInstructionFloat( + bb, instr, GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi)); break; } case OpCode::Id::TLD4S: { @@ -127,7 +127,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, {}, {}, {}, {}, component, element}; + MetaTexture meta{sampler, {}, {}, {}, {}, {}, component, element}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -152,7 +152,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { 
if (!instr.txq.IsComponentEnabled(element)) { continue; } - MetaTexture meta{sampler, {}, {}, {}, {}, {}, element}; + MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; const Node value = Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8)); SetTemporal(bb, indexer++, value); @@ -202,7 +202,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { for (u32 element = 0; element < 2; ++element) { auto params = coords; - MetaTexture meta{sampler, {}, {}, {}, {}, {}, element}; + MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); SetTemporal(bb, element, value); } @@ -325,7 +325,8 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr, Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, TextureProcessMode process_mode, std::vector<Node> coords, - Node array, Node depth_compare, u32 bias_offset) { + Node array, Node depth_compare, u32 bias_offset, + std::vector<Node> aoffi) { const bool is_array = array; const bool is_shadow = depth_compare; @@ -374,7 +375,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto copy_coords = coords; - MetaTexture meta{sampler, array, depth_compare, bias, lod, {}, element}; + MetaTexture meta{sampler, array, depth_compare, aoffi, bias, lod, {}, element}; values[element] = Operation(read_method, meta, std::move(copy_coords)); } @@ -382,9 +383,15 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, } Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, - TextureProcessMode process_mode, bool depth_compare, bool is_array) { - const bool lod_bias_enabled = - (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ); + TextureProcessMode process_mode, bool depth_compare, bool is_array, + bool 
is_aoffi) { + const bool lod_bias_enabled{ + (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ)}; + + u64 parameter_register = instr.gpr20.Value(); + if (lod_bias_enabled) { + ++parameter_register; + } const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement( texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5); @@ -404,15 +411,19 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, const Node array = is_array ? GetRegister(array_register) : nullptr; + std::vector<Node> aoffi; + if (is_aoffi) { + aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false); + } + Node dc{}; if (depth_compare) { // Depth is always stored in the register signaled by gpr20 or in the next register if lod // or bias are used - const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0); - dc = GetRegister(depth_register); + dc = GetRegister(parameter_register++); } - return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0); + return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0, aoffi); } Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, @@ -448,11 +459,11 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, dc = GetRegister(depth_register); } - return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset); + return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {}); } Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare, - bool is_array) { + bool is_array, bool is_aoffi) { const std::size_t coord_count = GetCoordCount(texture_type); const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0); const std::size_t total_reg_count = total_coord_count + (depth_compare ? 
1 : 0); @@ -463,15 +474,27 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de const u64 coord_register = array_register + (is_array ? 1 : 0); std::vector<Node> coords; - for (size_t i = 0; i < coord_count; ++i) + for (std::size_t i = 0; i < coord_count; ++i) { coords.push_back(GetRegister(coord_register + i)); + } + + u64 parameter_register = instr.gpr20.Value(); + std::vector<Node> aoffi; + if (is_aoffi) { + aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, true); + } + + Node dc{}; + if (depth_compare) { + dc = GetRegister(parameter_register++); + } const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, depth_compare); Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, GetRegister(array_register), {}, {}, {}, {}, element}; + MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, element}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -507,7 +530,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array, {}, {}, {}, lod, {}, element}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; @@ -531,4 +554,45 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement( return {coord_count, total_coord_count}; } -} // namespace VideoCommon::Shader
\ No newline at end of file +std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, + bool is_tld4) { + const auto [coord_offsets, size, wrap_value, + diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> { + if (is_tld4) { + return {{0, 8, 16}, 6, 32, 64}; + } else { + return {{0, 4, 8}, 4, 8, 16}; + } + }(); + const u32 mask = (1U << size) - 1; + + std::vector<Node> aoffi; + aoffi.reserve(coord_count); + + const auto aoffi_immediate{ + TrackImmediate(aoffi_reg, global_code, static_cast<s64>(global_code.size()))}; + if (!aoffi_immediate) { + // Variable access, not supported on AMD. + LOG_WARNING(HW_GPU, + "AOFFI constant folding failed, some hardware might have graphical issues"); + for (std::size_t coord = 0; coord < coord_count; ++coord) { + const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size); + const Node condition = + Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value)); + const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value)); + aoffi.push_back(Operation(OperationCode::Select, condition, negative, value)); + } + return aoffi; + } + + for (std::size_t coord = 0; coord < coord_count; ++coord) { + s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask; + if (value >= wrap_value) { + value -= diff_value; + } + aoffi.push_back(Immediate(value)); + } + return aoffi; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index c34843307..db15c0718 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -29,39 +29,55 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { const bool is_signed_b = instr.xmad.sign_b == 1; const bool is_signed_c = is_signed_a; - auto [is_merge, op_b, op_c] = [&]() -> std::tuple<bool, Node, Node> { + auto [is_merge, is_psl, is_high_b, mode, op_b, + op_c] = [&]() -> std::tuple<bool, 
bool, bool, Tegra::Shader::XmadMode, Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::XMAD_CR: return {instr.xmad.merge_56, + instr.xmad.product_shift_left_second, + instr.xmad.high_b, + instr.xmad.mode_cbf, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()), GetRegister(instr.gpr39)}; case OpCode::Id::XMAD_RR: - return {instr.xmad.merge_37, GetRegister(instr.gpr20), GetRegister(instr.gpr39)}; + return {instr.xmad.merge_37, instr.xmad.product_shift_left, instr.xmad.high_b_rr, + instr.xmad.mode, GetRegister(instr.gpr20), GetRegister(instr.gpr39)}; case OpCode::Id::XMAD_RC: - return {false, GetRegister(instr.gpr39), + return {false, + false, + instr.xmad.high_b, + instr.xmad.mode_cbf, + GetRegister(instr.gpr39), GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::XMAD_IMM: - return {instr.xmad.merge_37, Immediate(static_cast<u32>(instr.xmad.imm20_16)), + return {instr.xmad.merge_37, + instr.xmad.product_shift_left, + false, + instr.xmad.mode, + Immediate(static_cast<u32>(instr.xmad.imm20_16)), GetRegister(instr.gpr39)}; } UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); - return {false, Immediate(0), Immediate(0)}; + return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)}; }(); op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16); const Node original_b = op_b; - op_b = BitfieldExtract(op_b, instr.xmad.high_b ? 16 : 0, 16); + op_b = BitfieldExtract(op_b, is_high_b ? 
16 : 0, 16); // TODO(Rodrigo): Use an appropiate sign for this operation Node product = Operation(OperationCode::IMul, NO_PRECISE, op_a, op_b); - if (instr.xmad.product_shift_left) { + if (is_psl) { product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16)); } + SetTemporal(bb, 0, product); + product = GetTemporal(0); const Node original_c = op_c; + const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error op_c = [&]() { - switch (instr.xmad.mode) { + switch (set_mode) { case Tegra::Shader::XmadMode::None: return original_c; case Tegra::Shader::XmadMode::CLo: @@ -80,8 +96,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { } }(); + SetTemporal(bb, 1, op_c); + op_c = GetTemporal(1); + // TODO(Rodrigo): Use an appropiate sign for this operation Node sum = Operation(OperationCode::IAdd, product, op_c); + SetTemporal(bb, 2, sum); + sum = GetTemporal(2); if (is_merge) { const Node a = BitfieldExtract(sum, 0, 16); const Node b = @@ -95,4 +116,4 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { return pc; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 5bc3a3900..4888998d3 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -7,6 +7,7 @@ #include <array> #include <cstring> #include <map> +#include <optional> #include <set> #include <string> #include <tuple> @@ -290,6 +291,7 @@ struct MetaTexture { const Sampler& sampler; Node array{}; Node depth_compare{}; + std::vector<Node> aoffi; Node bias{}; Node lod{}; Node component{}; @@ -741,14 +743,14 @@ private: Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, bool depth_compare, - bool is_array); + bool is_array, bool is_aoffi); Node4 GetTexsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, bool depth_compare, bool is_array); Node4 GetTld4Code(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, - bool depth_compare, bool is_array); + bool depth_compare, bool is_array, bool is_aoffi); Node4 GetTldsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, bool is_array); @@ -757,9 +759,11 @@ private: Tegra::Shader::TextureType texture_type, bool depth_compare, bool is_array, bool lod_bias_enabled, std::size_t max_coords, std::size_t max_inputs); + std::vector<Node> GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, bool is_tld4); + Node4 GetTextureCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, std::vector<Node> coords, - Node array, Node depth_compare, u32 bias_offset); + Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi); Node GetVideoOperand(Node op, bool is_chunk, bool is_signed, Tegra::Shader::VideoType type, u64 byte_height); @@ -773,6 +777,8 @@ private: 
Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor); + std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor); + std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor); template <typename... T> diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 33b071747..4505667ff 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -6,6 +6,7 @@ #include <utility> #include <variant> +#include "common/common_types.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -14,7 +15,7 @@ namespace { std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { - const Node node = code[cursor]; + const Node node = code.at(cursor); if (const auto operation = std::get_if<OperationNode>(node)) { if (operation->GetCode() == operation_code) return {node, cursor}; @@ -64,6 +65,20 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) { return nullptr; } +std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) { + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register + // that it uses as operand + const auto [found, found_cursor] = + TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); + if (!found) { + return {}; + } + if (const auto immediate = std::get_if<ImmediateNode>(found)) { + return immediate->GetValue(); + } + return {}; +} + std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor) { for (; cursor >= 0; --cursor) { diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp index 5e439f036..82050bd51 100644 --- a/src/video_core/textures/convert.cpp +++ b/src/video_core/textures/convert.cpp @@ -10,6 +10,7 @@ #include "common/assert.h" #include "common/common_types.h" 
#include "common/logging/log.h" +#include "video_core/surface.h" #include "video_core/textures/astc.h" #include "video_core/textures/convert.h" diff --git a/src/video_core/textures/convert.h b/src/video_core/textures/convert.h index 07cd8b5da..12542e71c 100644 --- a/src/video_core/textures/convert.h +++ b/src/video_core/textures/convert.h @@ -5,7 +5,10 @@ #pragma once #include "common/common_types.h" -#include "video_core/surface.h" + +namespace VideoCore::Surface { +enum class PixelFormat; +} namespace Tegra::Texture { diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 93ecc6e31..bea0d5bc2 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -7,9 +7,7 @@ #include <array> #include "common/assert.h" #include "common/bit_field.h" -#include "common/common_funcs.h" #include "common/common_types.h" -#include "video_core/memory_manager.h" namespace Tegra::Texture { |
